Bitcoin Core 31.99.0
P2P Digital Currency
nanobench.h
Go to the documentation of this file.
1// __ _ _______ __ _ _____ ______ _______ __ _ _______ _ _
2// | \ | |_____| | \ | | | |_____] |______ | \ | | |_____|
3// | \_| | | | \_| |_____| |_____] |______ | \_| |_____ | |
4//
5// Microbenchmark framework for C++11/14/17/20
6// https://github.com/martinus/nanobench
7//
8// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
9// SPDX-License-Identifier: MIT
10// Copyright (c) 2019-2023 Martin Leitner-Ankerl <martin.ankerl@gmail.com>
11//
12// Permission is hereby granted, free of charge, to any person obtaining a copy
13// of this software and associated documentation files (the "Software"), to deal
14// in the Software without restriction, including without limitation the rights
15// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16// copies of the Software, and to permit persons to whom the Software is
17// furnished to do so, subject to the following conditions:
18//
19// The above copyright notice and this permission notice shall be included in all
20// copies or substantial portions of the Software.
21//
22// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28// SOFTWARE.
29
30#ifndef ANKERL_NANOBENCH_H_INCLUDED
31#define ANKERL_NANOBENCH_H_INCLUDED
32
33// see https://semver.org/
34#define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes
35#define ANKERL_NANOBENCH_VERSION_MINOR 3 // backwards-compatible changes
36#define ANKERL_NANOBENCH_VERSION_PATCH 11 // backwards-compatible bug fixes
37
39// public facing api - as minimal as possible
41
42#include <chrono> // high_resolution_clock
43#include <cassert> // assert
44#include <cstdint>
45#include <cstring> // memcpy
46// IWYU will only see this header with ANKERL_NANOBENCH_IMPLEMENT defined, which
47// makes it suggest removing forward declarations for the Input/output library.
48// IWYU pragma: begin_keep
49#include <iosfwd> // for std::ostream* custom output target in Config
50// IWYU pragma: end_keep
51#include <limits>
52#include <ratio>
53#include <string> // all names
54#include <type_traits>
55#include <unordered_map> // holds context information of results
56#include <utility>
57#include <vector> // holds all results
58
59#define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
60
61#define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
62#define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
63#define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
64#define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
65#define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
66
67#if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
68# define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
69#else
70# define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
71#endif
72
73#if defined(__clang__)
74# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
75 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
76# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
77#else
78# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
79# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
80#endif
81
82#if defined(__GNUC__)
83# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
84# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
85#else
86# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
87# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
88#endif
89
90#if defined(ANKERL_NANOBENCH_LOG_ENABLED)
91# include <iostream>
92# define ANKERL_NANOBENCH_LOG(x) \
93 do { \
94 std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \
95 } while (0)
96#else
97# define ANKERL_NANOBENCH_LOG(x) \
98 do { \
99 } while (0)
100#endif
101
102#define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
103#if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
104# include <linux/version.h>
105# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0)
106// PERF_COUNT_HW_REF_CPU_CYCLES only available since kernel 3.3
107// PERF_FLAG_FD_CLOEXEC since kernel 3.14
108# undef ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS
109# define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
110# endif
111#endif
112
113#if defined(__clang__)
114# define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
115#else
116# define ANKERL_NANOBENCH_NO_SANITIZE(...)
117#endif
118
119#if defined(_MSC_VER)
120# define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
121#else
122# define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
123#endif
124
125// workaround missing "is_trivially_copyable" in g++ < 5.0
126// See https://stackoverflow.com/a/31798726/48181
127#if defined(__GNUC__) && __GNUC__ < 5
128# define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
129#else
130# define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
131#endif
132
133// noexcept may be missing for std::string.
134// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58265
135#define ANKERL_NANOBENCH_PRIVATE_NOEXCEPT_STRING_MOVE() std::is_nothrow_move_assignable<std::string>::value
136
137// declarations ///////////////////////////////////////////////////////////////////////////////////
138
139// As definitions follow these declarations within this
140// header, IWYU considers some of them redundant and
141// suggests removing them. Disable this IWYU behavior.
142// IWYU pragma: begin_keep
143
144namespace ankerl {
145namespace nanobench {
146
147using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
148 std::chrono::steady_clock>::type;
149class Bench;
150struct Config;
151class Result;
152class Rng;
153class BigO;
154
155namespace detail {
156template <typename SetupOp>
157class SetupRunner;
158} // namespace detail
159
310void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
311void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out);
312
321void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
322void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
323
324// Contains mustache-like templates
325namespace templates {
326
336char const* csv() noexcept;
337
348char const* htmlBoxplot() noexcept;
349
356char const* pyperf() noexcept;
357
367char const* json() noexcept;
368
369} // namespace templates
370
371namespace detail {
372
373template <typename T>
374struct PerfCountSet;
375
376class IterationLogic;
377class PerformanceCounters;
378
379#if ANKERL_NANOBENCH(PERF_COUNTERS)
380class LinuxPerformanceCounters;
381#endif
382
383} // namespace detail
384} // namespace nanobench
385} // namespace ankerl
386
387// IWYU pragma: end_keep
388
389// definitions ////////////////////////////////////////////////////////////////////////////////////
390
391namespace ankerl {
392namespace nanobench {
393namespace detail {
394
395template <typename T>
403};
404
405} // namespace detail
406
407ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
408struct Config {
409 // actual benchmark config
410 std::string mBenchmarkTitle = "benchmark"; // NOLINT(misc-non-private-member-variables-in-classes)
411 std::string mBenchmarkName = "noname"; // NOLINT(misc-non-private-member-variables-in-classes)
412 std::string mUnit = "op"; // NOLINT(misc-non-private-member-variables-in-classes)
413 double mBatch = 1.0; // NOLINT(misc-non-private-member-variables-in-classes)
414 double mComplexityN = -1.0; // NOLINT(misc-non-private-member-variables-in-classes)
415 size_t mNumEpochs = 11; // NOLINT(misc-non-private-member-variables-in-classes)
416 size_t mClockResolutionMultiple = static_cast<size_t>(1000); // NOLINT(misc-non-private-member-variables-in-classes)
417 std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100); // NOLINT(misc-non-private-member-variables-in-classes)
418 std::chrono::nanoseconds mMinEpochTime = std::chrono::milliseconds(1); // NOLINT(misc-non-private-member-variables-in-classes)
419 uint64_t mMinEpochIterations{1}; // NOLINT(misc-non-private-member-variables-in-classes)
420 // If not 0, run *exactly* these number of iterations per epoch.
421 uint64_t mEpochIterations{0}; // NOLINT(misc-non-private-member-variables-in-classes)
422 uint64_t mWarmup = 0; // NOLINT(misc-non-private-member-variables-in-classes)
423 std::ostream* mOut = nullptr; // NOLINT(misc-non-private-member-variables-in-classes)
424 std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1}; // NOLINT(misc-non-private-member-variables-in-classes)
425 std::string mTimeUnitName = "ns"; // NOLINT(misc-non-private-member-variables-in-classes)
426 bool mShowPerformanceCounters = true; // NOLINT(misc-non-private-member-variables-in-classes)
427 bool mIsRelative = false; // NOLINT(misc-non-private-member-variables-in-classes)
428 std::unordered_map<std::string, std::string> mContext{}; // NOLINT(misc-non-private-member-variables-in-classes)
429
432 Config& operator=(Config const& other);
433 Config& operator=(Config&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
434 Config(Config const& other);
435 Config(Config&& other) noexcept;
436};
437ANKERL_NANOBENCH(IGNORE_PADDED_POP)
438
439// Result returned after a benchmark has finished. Can be used as a baseline for relative().
440ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
441class Result {
442public:
443 enum class Measure : size_t {
444 elapsed,
445 iterations,
446 pagefaults,
447 cpucycles,
448 contextswitches,
449 instructions,
450 branchinstructions,
451 branchmisses,
452 _size
453 };
454
455 explicit Result(Config benchmarkConfig);
456
458 Result& operator=(Result const& other);
459 Result& operator=(Result&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
460 Result(Result const& other);
461 Result(Result&& other) noexcept;
462
463 // adds new measurement results
464 // all values are scaled by iters (except iters...)
465 void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);
466
467 ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
468
469 ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;
470 ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;
471 ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;
472 ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;
473 ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;
474 ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;
475 ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;
476 ANKERL_NANOBENCH(NODISCARD) std::string const& context(char const* variableName) const;
477 ANKERL_NANOBENCH(NODISCARD) std::string const& context(std::string const& variableName) const;
478
479 ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;
480 ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;
481 ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;
482 ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept;
483
484 // Finds string, if not found, returns _size.
485 static Measure fromString(std::string const& str);
486
487private:
488 Config mConfig{};
489 std::vector<std::vector<double>> mNameToMeasurements{};
490};
491ANKERL_NANOBENCH(IGNORE_PADDED_POP)
492
493
510class Rng final {
511public:
515 using result_type = uint64_t;
516
517 static constexpr uint64_t(min)();
518 static constexpr uint64_t(max)();
519
525 Rng(Rng const&) = delete;
526
530 Rng& operator=(Rng const&) = delete;
531
532 // moving is ok
533 Rng(Rng&&) noexcept = default;
534 Rng& operator=(Rng&&) noexcept = default;
535 ~Rng() noexcept = default;
536
545
562 explicit Rng(uint64_t seed) noexcept;
563 Rng(uint64_t x, uint64_t y) noexcept;
564 explicit Rng(std::vector<uint64_t> const& data);
565
569 ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;
570
578 inline uint64_t operator()() noexcept;
579
580 // This is slightly biased. See
581
596 inline uint32_t bounded(uint32_t range) noexcept;
597
598 // random double in range [0, 1(
599 // see http://prng.di.unimi.it/
600
607 inline double uniform01() noexcept;
608
616 template <typename Container>
617 void shuffle(Container& container) noexcept;
618
625 ANKERL_NANOBENCH(NODISCARD) std::vector<uint64_t> state() const;
626
627private:
628 static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;
629
630 uint64_t mX;
631 uint64_t mY;
632};
633
648ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
649class Bench {
650public:
655
656 Bench(Bench&& other) noexcept;
657 Bench& operator=(Bench&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
658 Bench(Bench const& other);
659 Bench& operator=(Bench const& other);
660 ~Bench() noexcept;
661
680 template <typename Op>
681 ANKERL_NANOBENCH(NOINLINE)
682 Bench& run(char const* benchmarkName, Op&& op);
683
684 template <typename Op>
685 ANKERL_NANOBENCH(NOINLINE)
686 Bench& run(std::string const& benchmarkName, Op&& op);
687
692 template <typename Op>
693 ANKERL_NANOBENCH(NOINLINE)
694 Bench& run(Op&& op);
695
701 Bench& title(char const* benchmarkTitle);
702 Bench& title(std::string const& benchmarkTitle);
703
707 ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;
708
710 Bench& name(char const* benchmarkName);
711 Bench& name(std::string const& benchmarkName);
712 ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
713
726 Bench& context(char const* variableName, char const* variableValue);
727 Bench& context(std::string const& variableName, std::string const& variableValue);
728
737 Bench& clearContext();
738
748 template <typename T>
749 Bench& batch(T b) noexcept;
750 ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;
751
760 Bench& unit(char const* unit);
761 Bench& unit(std::string const& unit);
762 ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;
763
773 Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);
774 ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept;
775 ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;
776
784 Bench& output(std::ostream* outstream) noexcept;
785 ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;
786
807 Bench& clockResolutionMultiple(size_t multiple) noexcept;
808 ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;
809
825 Bench& epochs(size_t numEpochs) noexcept;
826 ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;
827
838 Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
839 ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;
840
851 Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
852 ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;
853
864 Bench& minEpochIterations(uint64_t numIters) noexcept;
865 ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;
866
873 Bench& epochIterations(uint64_t numIters) noexcept;
874 ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;
875
885 Bench& warmup(uint64_t numWarmupIters) noexcept;
886 ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;
887
905 Bench& relative(bool isRelativeEnabled) noexcept;
906 ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;
907
916 Bench& performanceCounters(bool showPerformanceCounters) noexcept;
917 ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;
918
927 ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;
928
936 template <typename Arg>
938
953 template <typename T>
954 Bench& complexityN(T n) noexcept;
955 ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;
956
988 std::vector<BigO> complexityBigO() const;
989
1013 template <typename Op>
1014 BigO complexityBigO(char const* name, Op op) const;
1015
1016 template <typename Op>
1017 BigO complexityBigO(std::string const& name, Op op) const;
1018
1026 Bench& render(char const* templateContent, std::ostream& os);
1027 Bench& render(std::string const& templateContent, std::ostream& os);
1028
1029 Bench& config(Config const& benchmarkConfig);
1030 ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
1031
1037 template <typename SetupOp>
1038 detail::SetupRunner<SetupOp> setup(SetupOp setupOp);
1039
1040private:
1041 template <typename SetupOp, typename Op>
1042 Bench& runImpl(SetupOp& setupOp, Op&& op);
1043
1044 template <typename SetupOp>
1045 friend class detail::SetupRunner;
1046
1047 Config mConfig{};
1048 std::vector<Result> mResults{};
1049};
1050ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1051
1052
1058template <typename Arg>
1059void doNotOptimizeAway(Arg&& arg);
1060
1061namespace detail {
1062
1063#if defined(_MSC_VER)
1064void doNotOptimizeAwaySink(void const*);
1065
1066template <typename T>
1067void doNotOptimizeAway(T const& val);
1068
1069#else
1070
1071// These assembly magic is directly from what Google Benchmark is doing. I have previously used what facebook's folly was doing, but
1072// this seemed to have compilation problems in some cases. Google Benchmark seemed to be the most well tested anyways.
1073// see https://github.com/google/benchmark/blob/v1.7.1/include/benchmark/benchmark.h#L443-L446
1074template <typename T>
1075void doNotOptimizeAway(T const& val) {
1076 // NOLINTNEXTLINE(hicpp-no-assembler)
1077 asm volatile("" : : "r,m"(val) : "memory");
1078}
1079
1080template <typename T>
1081void doNotOptimizeAway(T& val) {
1082# if defined(__clang__)
1083 // NOLINTNEXTLINE(hicpp-no-assembler)
1084 asm volatile("" : "+r,m"(val) : : "memory");
1085# else
1086 // NOLINTNEXTLINE(hicpp-no-assembler)
1087 asm volatile("" : "+m,r"(val) : : "memory");
1088# endif
1089}
1090#endif
1091
1092// internally used, but visible because run() is templated.
1093// Not movable/copy-able, so we simply use a pointer instead of unique_ptr. This saves us from
1094// having to include <memory>, and the template instantiation overhead of unique_ptr which is unfortunately quite significant.
1095ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)
1097public:
1098 explicit IterationLogic(Bench const& bench);
1104
1105 ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;
1106 void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;
1107 void moveResultTo(std::vector<Result>& results) noexcept;
1108
1109private:
1110 struct Impl;
1111 Impl* mPimpl;
1112};
1113ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)
1114
1115ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1117public:
1122
1125
1128 void updateResults(uint64_t numIters);
1129
1130 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;
1131 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;
1132
1133private:
1134#if ANKERL_NANOBENCH(PERF_COUNTERS)
1135 LinuxPerformanceCounters* mPc = nullptr;
1136#endif
1139};
1140ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1141
1142// Gets the singleton
1144
1145} // namespace detail
1146
1147class BigO {
1148public:
1149 using RangeMeasure = std::vector<std::pair<double, double>>;
1150
1151 template <typename Op>
1153 for (auto& rangeMeasure : data) {
1154 rangeMeasure.first = op(rangeMeasure.first);
1155 }
1156 return data;
1157 }
1158
1159 static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);
1160
1161 template <typename Op>
1162 BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1163 : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1164
1165 template <typename Op>
1166 BigO(std::string bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1167 : BigO(std::move(bigOName), mapRangeMeasure(rangeMeasure, rangeToN)) {}
1168
1169 BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);
1170 BigO(std::string bigOName, RangeMeasure const& scaledRangeMeasure);
1171 ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
1172 ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;
1173 ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;
1174 ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;
1175
1176private:
1177 std::string mName{};
1178 double mConstant{};
1179 double mNormalizedRootMeanSquare{};
1180};
1181std::ostream& operator<<(std::ostream& os, BigO const& bigO);
1182std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs);
1183
1184} // namespace nanobench
1185} // namespace ankerl
1186
1187// implementation /////////////////////////////////////////////////////////////////////////////////
1188
1189namespace ankerl {
1190namespace nanobench {
1191
1192constexpr uint64_t(Rng::min)() {
1193 return 0;
1194}
1195
1196constexpr uint64_t(Rng::max)() {
1197 return (std::numeric_limits<uint64_t>::max)();
1198}
1199
1200ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1201uint64_t Rng::operator()() noexcept {
1202 auto x = mX;
1203
1204 mX = UINT64_C(15241094284759029579) * mY;
1205 mY = rotl(mY - x, 27);
1206
1207 return x;
1208}
1209
1210ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1211uint32_t Rng::bounded(uint32_t range) noexcept {
1212 uint64_t const r32 = static_cast<uint32_t>(operator()());
1213 auto multiresult = r32 * range;
1214 return static_cast<uint32_t>(multiresult >> 32U);
1215}
1216
1217double Rng::uniform01() noexcept {
1218 auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);
1219 // can't use union in c++ here for type puning, it's undefined behavior.
1220 // std::memcpy is optimized anyways.
1221 double d{};
1222 std::memcpy(&d, &i, sizeof(double));
1223 return d - 1.0;
1224}
1225
1226template <typename Container>
1227void Rng::shuffle(Container& container) noexcept {
1228 auto i = container.size();
1229 while (i > 1U) {
1230 using std::swap;
1231 auto n = operator()();
1232 // using decltype(i) instead of size_t to be compatible to containers with 32bit index (see #80)
1233 auto b1 = static_cast<decltype(i)>((static_cast<uint32_t>(n) * static_cast<uint64_t>(i)) >> 32U);
1234 swap(container[--i], container[b1]);
1235
1236 auto b2 = static_cast<decltype(i)>(((n >> 32U) * static_cast<uint64_t>(i)) >> 32U);
1237 swap(container[--i], container[b2]);
1238 }
1239}
1240
1241ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1242constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
1243 return (x << k) | (x >> (64U - k));
1244}
1245
1246namespace detail {
1247
1248template <typename SetupOp>
1250public:
1251 explicit SetupRunner(SetupOp setupOp, Bench& bench)
1252 : mSetupOp(std::move(setupOp))
1253 , mBench(bench) {}
1254
1255 template <typename Op>
1257 Bench& run(Op&& op) {
1258 assert((mBench.epochIterations() <= 1) &&
1259 "setup() runs once per epoch, not once per iteration; it requires epochIterations(1)");
1260 mBench.epochIterations(1);
1261 return mBench.runImpl(mSetupOp, std::forward<Op>(op));
1262 }
1263
1264private:
1265 SetupOp mSetupOp;
1267};
1268} // namespace detail
1269
1270template <typename Op>
1272Bench& Bench::run(Op&& op) {
1273 auto setupOp = [] {};
1274 return runImpl(setupOp, std::forward<Op>(op));
1275}
1276
1277template <typename SetupOp, typename Op>
1279Bench& Bench::runImpl(SetupOp& setupOp, Op&& op) {
1280 // It is important that this method is kept short so the compiler can do better optimizations/ inlining of op()
1281 detail::IterationLogic iterationLogic(*this);
1282 auto& pc = detail::performanceCounters();
1283
1284 while (auto n = iterationLogic.numIters()) {
1285 setupOp();
1286
1287 pc.beginMeasure();
1288 Clock::time_point const before = Clock::now();
1289 while (n-- > 0) {
1290 op();
1291 }
1292 Clock::time_point const after = Clock::now();
1293 pc.endMeasure();
1294 pc.updateResults(iterationLogic.numIters());
1295 iterationLogic.add(after - before, pc);
1296 }
1297 iterationLogic.moveResultTo(mResults);
1298 return *this;
1299}
1300
1301template <typename SetupOp>
1303 return detail::SetupRunner<SetupOp>(std::move(setupOp), *this);
1304}
1305
1306// Performs all evaluations.
1307template <typename Op>
1308Bench& Bench::run(char const* benchmarkName, Op&& op) {
1309 name(benchmarkName);
1310 return run(std::forward<Op>(op));
1311}
1312
1313template <typename Op>
1314Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
1315 name(benchmarkName);
1316 return run(std::forward<Op>(op));
1317}
1318
1319template <typename Op>
1320BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
1321 return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1322}
1323
1324template <typename Op>
1325BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
1326 return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1327}
1328
1329// Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration.
1330// Any argument is cast to double.
1331template <typename T>
1332Bench& Bench::batch(T b) noexcept {
1333 mConfig.mBatch = static_cast<double>(b);
1334 return *this;
1335}
1336
1337// Sets the computation complexity of the next run. Any argument is cast to double.
1338template <typename T>
1340 mConfig.mComplexityN = static_cast<double>(n);
1341 return *this;
1342}
1343
1344// Convenience: makes sure none of the given arguments are optimized away by the compiler.
1345template <typename Arg>
1347 detail::doNotOptimizeAway(std::forward<Arg>(arg));
1348 return *this;
1349}
1350
1351// Makes sure none of the given arguments are optimized away by the compiler.
1352template <typename Arg>
1353void doNotOptimizeAway(Arg&& arg) {
1354 detail::doNotOptimizeAway(std::forward<Arg>(arg));
1355}
1356
1357namespace detail {
1358
1359#if defined(_MSC_VER)
1360template <typename T>
1361void doNotOptimizeAway(T const& val) {
1362 doNotOptimizeAwaySink(&val);
1363}
1364
1365#endif
1366
1367} // namespace detail
1368} // namespace nanobench
1369} // namespace ankerl
1370
1371#if defined(ANKERL_NANOBENCH_IMPLEMENT)
1372
1374// implementation part - only visible in .cpp
1376
1377# include <algorithm> // sort, reverse
1378# include <cmath>
1379# include <compare>
1380# include <cstdlib> // getenv
1381# include <fstream> // ifstream to parse proc files
1382# include <functional>
1383# include <iomanip> // setw, setprecision
1384# include <iostream> // cout
1385# include <iterator>
1386# include <locale>
1387# include <numeric> // accumulate
1388# include <random> // random_device
1389# include <sstream> // to_s in Number
1390# include <stdexcept> // throw for rendering templates
1391# include <tuple> // std::tie
1392# if defined(__linux__)
1393# include <unistd.h> //sysconf
1394# endif
1395# if ANKERL_NANOBENCH(PERF_COUNTERS)
1396# include <map> // map
1397
1398# include <linux/perf_event.h>
1399# include <sys/ioctl.h>
1400# include <sys/syscall.h>
1401# include <sys/types.h>
1402# endif
1403
1404// declarations ///////////////////////////////////////////////////////////////////////////////////
1405
1406// As definitions follow these declarations within this
1407// header, IWYU considers some of them redundant and
1408// suggests removing them. Disable this IWYU behavior.
1409// IWYU pragma: begin_keep
1410
1411namespace ankerl {
1412namespace nanobench {
1413
1414// helper stuff that is only intended to be used internally
1415namespace detail {
1416
1417// formatting utilities
1418namespace fmt {
1419
1420class NumSep;
1421class StreamStateRestorer;
1422class Number;
1423class MarkDownColumn;
1424class MarkDownCode;
1425
1426} // namespace fmt
1427} // namespace detail
1428} // namespace nanobench
1429} // namespace ankerl
1430
1431// IWYU pragma: end_keep
1432
1433// definitions ////////////////////////////////////////////////////////////////////////////////////
1434
1435namespace ankerl {
1436namespace nanobench {
1437
1438uint64_t splitMix64(uint64_t& state) noexcept;
1439
1440namespace detail {
1441
1442// helpers to get double values
1443template <typename T>
1444inline double d(T t) noexcept {
1445 return static_cast<double>(t);
1446}
1447inline double d(Clock::duration duration) noexcept {
1448 return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
1449}
1450
1451// Calculates clock resolution once, and remembers the result
1452inline Clock::duration clockResolution() noexcept;
1453
1454} // namespace detail
1455
1456namespace templates {
1457
1458char const* csv() noexcept {
1459 return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
1460{{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
1461{{/result}})DELIM";
1462}
1463
1464char const* htmlBoxplot() noexcept {
1465 return R"DELIM(<html>
1466
1467<head>
1468 <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
1469</head>
1470
1471<body>
1472 <div id="myDiv"></div>
1473 <script>
1474 var data = [
1475 {{#result}}{
1476 name: '{{name}}',
1477 y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}],
1478 },
1479 {{/result}}
1480 ];
1481 var title = '{{title}}';
1482
1483 data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
1484 var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
1485 </script>
1486</body>
1487
1488</html>)DELIM";
1489}
1490
1491char const* pyperf() noexcept {
1492 return R"DELIM({
1493 "benchmarks": [
1494 {
1495 "runs": [
1496 {
1497 "values": [
1498{{#measurement}} {{elapsed}}{{^-last}},
1499{{/last}}{{/measurement}}
1500 ]
1501 }
1502 ]
1503 }
1504 ],
1505 "metadata": {
1506 "loops": {{sum(iterations)}},
1507 "inner_loops": {{batch}},
1508 "name": "{{title}}",
1509 "unit": "second"
1510 },
1511 "version": "1.0"
1512})DELIM";
1513}
1514
1515char const* json() noexcept {
1516 return R"DELIM({
1517 "results": [
1518{{#result}} {
1519 "title": "{{title}}",
1520 "name": "{{name}}",
1521 "unit": "{{unit}}",
1522 "batch": {{batch}},
1523 "complexityN": {{complexityN}},
1524 "epochs": {{epochs}},
1525 "clockResolution": {{clockResolution}},
1526 "clockResolutionMultiple": {{clockResolutionMultiple}},
1527 "maxEpochTime": {{maxEpochTime}},
1528 "minEpochTime": {{minEpochTime}},
1529 "minEpochIterations": {{minEpochIterations}},
1530 "epochIterations": {{epochIterations}},
1531 "warmup": {{warmup}},
1532 "relative": {{relative}},
1533 "median(elapsed)": {{median(elapsed)}},
1534 "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
1535 "median(instructions)": {{median(instructions)}},
1536 "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
1537 "median(cpucycles)": {{median(cpucycles)}},
1538 "median(contextswitches)": {{median(contextswitches)}},
1539 "median(pagefaults)": {{median(pagefaults)}},
1540 "median(branchinstructions)": {{median(branchinstructions)}},
1541 "median(branchmisses)": {{median(branchmisses)}},
1542 "totalTime": {{sumProduct(iterations, elapsed)}},
1543 "measurements": [
1544{{#measurement}} {
1545 "iterations": {{iterations}},
1546 "elapsed": {{elapsed}},
1547 "pagefaults": {{pagefaults}},
1548 "cpucycles": {{cpucycles}},
1549 "contextswitches": {{contextswitches}},
1550 "instructions": {{instructions}},
1551 "branchinstructions": {{branchinstructions}},
1552 "branchmisses": {{branchmisses}}
1553 }{{^-last}},{{/-last}}
1554{{/measurement}} ]
1555 }{{^-last}},{{/-last}}
1556{{/result}} ]
1557})DELIM";
1558}
1559
1560ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1561struct Node {
1562 enum class Type { tag, content, section, inverted_section };
1563
1564 char const* begin;
1565 char const* end;
1566 std::vector<Node> children;
1567 Type type;
1568
1569 template <size_t N>
1570 // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
1571 bool operator==(char const (&str)[N]) const noexcept {
1572 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay)
1573 return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
1574 }
1575};
1576ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1577
1578// NOLINTNEXTLINE(misc-no-recursion)
1579static std::vector<Node> parseMustacheTemplate(char const** tpl) {
1580 std::vector<Node> nodes;
1581
1582 while (true) {
1583 auto const* begin = std::strstr(*tpl, "{{");
1584 auto const* end = begin;
1585 if (begin != nullptr) {
1586 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1587 begin += 2;
1588 end = std::strstr(begin, "}}");
1589 }
1590
1591 if (begin == nullptr || end == nullptr) {
1592 // nothing found, finish node
1593 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1594 nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
1595 return nodes;
1596 }
1597
1598 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1599 nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});
1600
1601 // we found a tag
1602 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1603 *tpl = end + 2;
1604 switch (*begin) {
1605 case '/':
1606 // finished! bail out
1607 return nodes;
1608
1609 case '#':
1610 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1611 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
1612 break;
1613
1614 case '^':
1615 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1616 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
1617 break;
1618
1619 default:
1620 nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
1621 break;
1622 }
1623 }
1624}
1625
1626static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
1627 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1628 bool const matchFirst = n == "-first";
1629 bool const matchLast = n == "-last";
1630 if (!matchFirst && !matchLast) {
1631 return false;
1632 }
1633
1634 bool doWrite = false;
1635 if (n.type == Node::Type::section) {
1636 doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
1637 } else if (n.type == Node::Type::inverted_section) {
1638 doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
1639 }
1640
1641 if (doWrite) {
1642 for (auto const& child : n.children) {
1643 if (child.type == Node::Type::content) {
1644 out.write(child.begin, std::distance(child.begin, child.end));
1645 }
1646 }
1647 }
1648 return true;
1649}
1650
1651static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
1652 matchResult.clear();
1653 auto idxOpen = str.find('(');
1654 auto idxClose = str.find(')', idxOpen);
1655 if (idxClose == std::string::npos) {
1656 return false;
1657 }
1658
1659 matchResult.emplace_back(str.substr(0, idxOpen));
1660
1661 // split by comma
1662 matchResult.emplace_back();
1663 for (size_t i = idxOpen + 1; i != idxClose; ++i) {
1664 if (str[i] == ' ' || str[i] == '\t') {
1665 // skip whitespace
1666 continue;
1667 }
1668 if (str[i] == ',') {
1669 // got a comma => new string
1670 matchResult.emplace_back();
1671 continue;
1672 }
1673 // no whitespace no comma, append
1674 matchResult.back() += str[i];
1675 }
1676 return true;
1677}
1678
1679static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
1680 using detail::d;
1681
1682 if (n == "title") {
1683 out << config.mBenchmarkTitle;
1684 return true;
1685 }
1686 if (n == "name") {
1687 out << config.mBenchmarkName;
1688 return true;
1689 }
1690 if (n == "unit") {
1691 out << config.mUnit;
1692 return true;
1693 }
1694 if (n == "batch") {
1695 out << config.mBatch;
1696 return true;
1697 }
1698 if (n == "complexityN") {
1699 out << config.mComplexityN;
1700 return true;
1701 }
1702 if (n == "epochs") {
1703 out << config.mNumEpochs;
1704 return true;
1705 }
1706 if (n == "clockResolution") {
1707 out << d(detail::clockResolution());
1708 return true;
1709 }
1710 if (n == "clockResolutionMultiple") {
1711 out << config.mClockResolutionMultiple;
1712 return true;
1713 }
1714 if (n == "maxEpochTime") {
1715 out << d(config.mMaxEpochTime);
1716 return true;
1717 }
1718 if (n == "minEpochTime") {
1719 out << d(config.mMinEpochTime);
1720 return true;
1721 }
1722 if (n == "minEpochIterations") {
1723 out << config.mMinEpochIterations;
1724 return true;
1725 }
1726 if (n == "epochIterations") {
1727 out << config.mEpochIterations;
1728 return true;
1729 }
1730 if (n == "warmup") {
1731 out << config.mWarmup;
1732 return true;
1733 }
1734 if (n == "relative") {
1735 out << config.mIsRelative;
1736 return true;
1737 }
1738 return false;
1739}
1740
1741// NOLINTNEXTLINE(readability-function-cognitive-complexity)
1742static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {
1743 if (generateConfigTag(n, r.config(), out)) {
1744 return out;
1745 }
1746 // match e.g. "median(elapsed)"
1747 // g++ 4.8 doesn't implement std::regex :(
1748 // static std::regex const regOpArg1("^([a-zA-Z]+)\\‍(([a-zA-Z]*)\\‍)$");
1749 // std::cmatch matchResult;
1750 // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) {
1751 std::vector<std::string> matchResult;
1752 if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
1753 if (matchResult.size() == 2) {
1754 if (matchResult[0] == "context") {
1755 return out << r.context(matchResult[1]);
1756 }
1757
1758 auto m = Result::fromString(matchResult[1]);
1759 if (m == Result::Measure::_size) {
1760 return out << 0.0;
1761 }
1762
1763 if (matchResult[0] == "median") {
1764 return out << r.median(m);
1765 }
1766 if (matchResult[0] == "average") {
1767 return out << r.average(m);
1768 }
1769 if (matchResult[0] == "medianAbsolutePercentError") {
1770 return out << r.medianAbsolutePercentError(m);
1771 }
1772 if (matchResult[0] == "sum") {
1773 return out << r.sum(m);
1774 }
1775 if (matchResult[0] == "minimum") {
1776 return out << r.minimum(m);
1777 }
1778 if (matchResult[0] == "maximum") {
1779 return out << r.maximum(m);
1780 }
1781 } else if (matchResult.size() == 3) {
1782 auto m1 = Result::fromString(matchResult[1]);
1783 auto m2 = Result::fromString(matchResult[2]);
1785 return out << 0.0;
1786 }
1787
1788 if (matchResult[0] == "sumProduct") {
1789 return out << r.sumProduct(m1, m2);
1790 }
1791 }
1792 }
1793
1794 // match e.g. "sumProduct(elapsed, iterations)"
1795 // static std::regex const regOpArg2("^([a-zA-Z]+)\\‍(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\‍)$");
1796
1797 // nothing matches :(
1798 throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood");
1799}
1800
1801static void generateResultMeasurement(std::vector<Node> const& nodes, size_t idx, Result const& r, std::ostream& out) {
1802 for (auto const& n : nodes) {
1803 if (!generateFirstLast(n, idx, r.size(), out)) {
1804 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1805 switch (n.type) {
1806 case Node::Type::content:
1807 out.write(n.begin, std::distance(n.begin, n.end));
1808 break;
1809
1810 case Node::Type::inverted_section:
1811 throw std::runtime_error("got a inverted section inside measurement");
1812
1813 case Node::Type::section:
1814 throw std::runtime_error("got a section inside measurement");
1815
1816 case Node::Type::tag: {
1817 auto m = Result::fromString(std::string(n.begin, n.end));
1818 if (m == Result::Measure::_size || !r.has(m)) {
1819 out << 0.0;
1820 } else {
1821 out << r.get(idx, m);
1822 }
1823 break;
1824 }
1825 }
1826 }
1827 }
1828}
1829
1830static void generateResult(std::vector<Node> const& nodes, size_t idx, std::vector<Result> const& results, std::ostream& out) {
1831 auto const& r = results[idx];
1832 for (auto const& n : nodes) {
1833 if (!generateFirstLast(n, idx, results.size(), out)) {
1834 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1835 switch (n.type) {
1836 case Node::Type::content:
1837 out.write(n.begin, std::distance(n.begin, n.end));
1838 break;
1839
1840 case Node::Type::inverted_section:
1841 throw std::runtime_error("got a inverted section inside result");
1842
1843 case Node::Type::section:
1844 if (n == "measurement") {
1845 for (size_t i = 0; i < r.size(); ++i) {
1846 generateResultMeasurement(n.children, i, r, out);
1847 }
1848 } else {
1849 throw std::runtime_error("got a section inside result");
1850 }
1851 break;
1852
1853 case Node::Type::tag:
1854 generateResultTag(n, r, out);
1855 break;
1856 }
1857 }
1858 }
1859}
1860
1861} // namespace templates
1862
1863// helper stuff that only intended to be used internally
1864namespace detail {
1865
1866char const* getEnv(char const* name);
1867bool isEndlessRunning(std::string const& name);
1868bool isWarningsEnabled();
1869
1870template <typename T>
1871T parseFile(std::string const& filename, bool* fail);
1872
1873void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
1874void printStabilityInformationOnce(std::ostream* outStream);
1875
1876// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
1877uint64_t& singletonHeaderHash() noexcept;
1878
1879// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
1880Clock::duration calcClockResolution(size_t numEvaluations) noexcept;
1881
1882// formatting utilities
1883namespace fmt {
1884
1885// adds thousands separator to numbers
1886ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1887class NumSep : public std::numpunct<char> {
1888public:
1889 explicit NumSep(char sep);
1890 char do_thousands_sep() const override;
1891 std::string do_grouping() const override;
1892
1893private:
1894 char mSep;
1895};
1896ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1897
1898// RAII to save & restore a stream's state
1899ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1900class StreamStateRestorer {
1901public:
1902 explicit StreamStateRestorer(std::ostream& s);
1903 ~StreamStateRestorer();
1904
1905 // sets back all stream info that we remembered at construction
1906 void restore();
1907
1908 // don't allow copying / moving
1909 StreamStateRestorer(StreamStateRestorer const&) = delete;
1910 StreamStateRestorer& operator=(StreamStateRestorer const&) = delete;
1911 StreamStateRestorer(StreamStateRestorer&&) = delete;
1912 StreamStateRestorer& operator=(StreamStateRestorer&&) = delete;
1913
1914private:
1915 std::ostream& mStream;
1916 std::locale mLocale;
1917 std::streamsize const mPrecision;
1918 std::streamsize const mWidth;
1919 std::ostream::char_type const mFill;
1920 std::ostream::fmtflags const mFmtFlags;
1921};
1922ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1923
1924// Number formatter
1925class Number {
1926public:
1927 Number(int width, int precision, double value);
1928 Number(int width, int precision, int64_t value);
1929 ANKERL_NANOBENCH(NODISCARD) std::string to_s() const;
1930
1931private:
1932 friend std::ostream& operator<<(std::ostream& os, Number const& n);
1933 std::ostream& write(std::ostream& os) const;
1934
1935 int mWidth;
1936 int mPrecision;
1937 double mValue;
1938};
1939
1940// helper replacement for std::to_string of signed/unsigned numbers so we are locale independent
1941std::string to_s(uint64_t n);
1942
1943std::ostream& operator<<(std::ostream& os, Number const& n);
1944
1945class MarkDownColumn {
1946public:
1947 MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val) noexcept;
1948 ANKERL_NANOBENCH(NODISCARD) std::string title() const;
1949 ANKERL_NANOBENCH(NODISCARD) std::string separator() const;
1950 ANKERL_NANOBENCH(NODISCARD) std::string invalid() const;
1951 ANKERL_NANOBENCH(NODISCARD) std::string value() const;
1952
1953private:
1954 int mWidth;
1955 int mPrecision;
1956 std::string mTitle;
1957 std::string mSuffix;
1958 double mValue;
1959};
1960
1961// Formats any text as markdown code, escaping backticks.
1962class MarkDownCode {
1963public:
1964 explicit MarkDownCode(std::string const& what);
1965
1966private:
1967 friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1968 std::ostream& write(std::ostream& os) const;
1969
1970 std::string mWhat{};
1971};
1972
1973std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1974
1975} // namespace fmt
1976} // namespace detail
1977} // namespace nanobench
1978} // namespace ankerl
1979
1980// implementation /////////////////////////////////////////////////////////////////////////////////
1981
1982namespace ankerl {
1983namespace nanobench {
1984
1985// NOLINTNEXTLINE(readability-function-cognitive-complexity)
1986void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
1987 detail::fmt::StreamStateRestorer const restorer(out);
1988
1989 out.precision(std::numeric_limits<double>::digits10);
1990 auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);
1991
1992 for (auto const& n : nodes) {
1993 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1994 switch (n.type) {
1995 case templates::Node::Type::content:
1996 out.write(n.begin, std::distance(n.begin, n.end));
1997 break;
1998
1999 case templates::Node::Type::inverted_section:
2000 throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'");
2001
2002 case templates::Node::Type::section:
2003 if (n == "result") {
2004 const size_t nbResults = results.size();
2005 for (size_t i = 0; i < nbResults; ++i) {
2006 generateResult(n.children, i, results, out);
2007 }
2008 } else if (n == "measurement") {
2009 if (results.size() != 1) {
2010 throw std::runtime_error(
2011 "render: can only use section 'measurement' here if there is a single result, but there are " +
2012 detail::fmt::to_s(results.size()));
2013 }
2014 // when we only have a single result, we can immediately go into its measurement.
2015 auto const& r = results.front();
2016 for (size_t i = 0; i < r.size(); ++i) {
2017 generateResultMeasurement(n.children, i, r, out);
2018 }
2019 } else {
2020 throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'");
2021 }
2022 break;
2023
2024 case templates::Node::Type::tag:
2025 if (results.size() == 1) {
2026 // result & config are both supported there
2027 generateResultTag(n, results.front(), out);
2028 } else {
2029 // This just uses the last result's config.
2030 if (!generateConfigTag(n, results.back().config(), out)) {
2031 throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'");
2032 }
2033 }
2034 break;
2035 }
2036 }
2037}
2038
2039void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
2040 render(mustacheTemplate.c_str(), results, out);
2041}
2042
2043void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {
2044 render(mustacheTemplate, bench.results(), out);
2045}
2046
2047void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) {
2048 render(mustacheTemplate.c_str(), bench.results(), out);
2049}
2050
2051namespace detail {
2052
2053PerformanceCounters& performanceCounters() {
2054# if defined(__clang__)
2055# pragma clang diagnostic push
2056# pragma clang diagnostic ignored "-Wexit-time-destructors"
2057# endif
2058 static PerformanceCounters pc;
2059# if defined(__clang__)
2060# pragma clang diagnostic pop
2061# endif
2062 return pc;
2063}
2064
2065// Windows version of doNotOptimizeAway
2066// see https://github.com/google/benchmark/blob/v1.7.1/include/benchmark/benchmark.h#L514
2067// see https://github.com/facebook/folly/blob/v2023.01.30.00/folly/lang/Hint-inl.h#L54-L58
2068// see https://learn.microsoft.com/en-us/cpp/preprocessor/optimize
2069# if defined(_MSC_VER)
2070# pragma optimize("", off)
2071void doNotOptimizeAwaySink(void const*) {}
2072# pragma optimize("", on)
2073# endif
2074
2075template <typename T>
2076T parseFile(std::string const& filename, bool* fail) {
2077 std::ifstream fin(filename); // NOLINT(misc-const-correctness)
2078 T num{};
2079 fin >> num;
2080 if (fail != nullptr) {
2081 *fail = fin.fail();
2082 }
2083 return num;
2084}
2085
2086char const* getEnv(char const* name) {
2087# if defined(_MSC_VER)
2088# pragma warning(push)
2089# pragma warning(disable : 4996) // getenv': This function or variable may be unsafe.
2090# endif
2091 return std::getenv(name); // NOLINT(concurrency-mt-unsafe)
2092# if defined(_MSC_VER)
2093# pragma warning(pop)
2094# endif
2095}
2096
2097bool isEndlessRunning(std::string const& name) {
2098 auto const* const endless = getEnv("NANOBENCH_ENDLESS");
2099 return nullptr != endless && endless == name;
2100}
2101
2102// True when environment variable NANOBENCH_SUPPRESS_WARNINGS is either not set at all, or set to "0"
2103bool isWarningsEnabled() {
2104 auto const* const suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS");
2105 return nullptr == suppression || suppression == std::string("0");
2106}
2107
2108void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
2109 warnings.clear();
2110 recommendations.clear();
2111
2112# if defined(DEBUG)
2113 warnings.emplace_back("DEBUG defined");
2114 bool const recommendCheckFlags = true;
2115# else
2116 bool const recommendCheckFlags = false;
2117# endif
2118
2119 bool recommendPyPerf = false;
2120# if defined(__linux__)
2121 auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
2122 if (nprocs <= 0) {
2123 warnings.emplace_back("couldn't figure out number of processors - no governor, turbo check possible");
2124 } else {
2125 // check frequency scaling
2126 for (long id = 0; id < nprocs; ++id) {
2127 auto idStr = detail::fmt::to_s(static_cast<uint64_t>(id));
2128 auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr;
2129 auto minFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_min_freq", nullptr);
2130 auto maxFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_max_freq", nullptr);
2131 if (minFreq != maxFreq) {
2132 auto minMHz = d(minFreq) / 1000.0;
2133 auto maxMHz = d(maxFreq) / 1000.0;
2134 warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " +
2135 detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
2136 " MHz");
2137 recommendPyPerf = true;
2138 break;
2139 }
2140 }
2141
2142 auto fail = false;
2143 auto currentGovernor = parseFile<std::string>("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor", &fail);
2144 if (!fail && "performance" != currentGovernor) {
2145 warnings.emplace_back("CPU governor is '" + currentGovernor + "' but should be 'performance'");
2146 recommendPyPerf = true;
2147 }
2148
2149 auto noTurbo = parseFile<int>("/sys/devices/system/cpu/intel_pstate/no_turbo", &fail);
2150 if (!fail && noTurbo == 0) {
2151 warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate");
2152 recommendPyPerf = true;
2153 }
2154 }
2155# endif
2156
2157 if (recommendCheckFlags) {
2158 recommendations.emplace_back("Make sure you compile for Release");
2159 }
2160 if (recommendPyPerf) {
2161 recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");
2162 }
2163}
2164
2165void printStabilityInformationOnce(std::ostream* outStream) {
2166 static bool shouldPrint = true;
2167 if (shouldPrint && (nullptr != outStream) && isWarningsEnabled()) {
2168 auto& os = *outStream;
2169 shouldPrint = false;
2170 std::vector<std::string> warnings;
2171 std::vector<std::string> recommendations;
2172 gatherStabilityInformation(warnings, recommendations);
2173 if (warnings.empty()) {
2174 return;
2175 }
2176
2177 os << "Warning, results might be unstable:" << std::endl;
2178 for (auto const& w : warnings) {
2179 os << "* " << w << std::endl;
2180 }
2181
2182 os << std::endl << "Recommendations" << std::endl;
2183 for (auto const& r : recommendations) {
2184 os << "* " << r << std::endl;
2185 }
2186 }
2187}
2188
2189// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
2190uint64_t& singletonHeaderHash() noexcept {
2191 static uint64_t sHeaderHash{};
2192 return sHeaderHash;
2193}
2194
2195ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2196inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
2197 return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U));
2198}
2199
2200// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
2201Clock::duration calcClockResolution(size_t numEvaluations) noexcept {
2202 auto bestDuration = Clock::duration::max();
2203 Clock::time_point tBegin;
2204 Clock::time_point tEnd;
2205 for (size_t i = 0; i < numEvaluations; ++i) {
2206 tBegin = Clock::now();
2207 do {
2208 tEnd = Clock::now();
2209 } while (tBegin == tEnd);
2210 bestDuration = (std::min)(bestDuration, tEnd - tBegin);
2211 }
2212 return bestDuration;
2213}
2214
2215// Calculates clock resolution once, and remembers the result
2216Clock::duration clockResolution() noexcept {
2217 static Clock::duration const sResolution = calcClockResolution(20);
2218 return sResolution;
2219}
2220
2221ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
2222struct IterationLogic::Impl {
2223 enum class State { warmup, upscaling_runtime, measuring, endless };
2224
2225 explicit Impl(Bench const& bench)
2226 : mBench(bench)
2227 , mResult(bench.config()) {
2228 printStabilityInformationOnce(mBench.output());
2229
2230 // determine target runtime per epoch
2231 mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
2232 if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
2233 mTargetRuntimePerEpoch = mBench.maxEpochTime();
2234 }
2235 if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
2236 mTargetRuntimePerEpoch = mBench.minEpochTime();
2237 }
2238
2239 if (isEndlessRunning(mBench.name())) {
2240 std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl;
2241 mNumIters = (std::numeric_limits<uint64_t>::max)();
2242 mState = State::endless;
2243 } else if (0 != mBench.warmup()) {
2244 mNumIters = mBench.warmup();
2245 mState = State::warmup;
2246 } else if (0 != mBench.epochIterations()) {
2247 // exact number of iterations
2248 mNumIters = mBench.epochIterations();
2249 mState = State::measuring;
2250 } else {
2251 mNumIters = mBench.minEpochIterations();
2252 mState = State::upscaling_runtime;
2253 }
2254 }
2255
2256 // directly calculates new iters based on elapsed&iters, and adds a 10% noise. Makes sure we don't underflow.
2257 ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {
2258 auto doubleElapsed = d(elapsed);
2259 auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
2260 auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);
2261
2262 auto doubleMinEpochIters = d(mBench.minEpochIterations());
2263 if (doubleNewIters < doubleMinEpochIters) {
2264 doubleNewIters = doubleMinEpochIters;
2265 }
2266 doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();
2267
2268 // +0.5 for correct rounding when casting
2269 // NOLINTNEXTLINE(bugprone-incorrect-roundings)
2270 return static_cast<uint64_t>(doubleNewIters + 0.5);
2271 }
2272
2273 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") void upscale(std::chrono::nanoseconds elapsed) {
2274 if (elapsed * 10 < mTargetRuntimePerEpoch) {
2275 // we are far below the target runtime. Multiply iterations by 10 (with overflow check)
2276 if (mNumIters * 10 < mNumIters) {
2277 // overflow :-(
2278 showResult("iterations overflow. Maybe your code got optimized away?");
2279 mNumIters = 0;
2280 return;
2281 }
2282 mNumIters *= 10;
2283 } else {
2284 mNumIters = calcBestNumIters(elapsed, mNumIters);
2285 }
2286 }
2287
2288 void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
2289# if defined(ANKERL_NANOBENCH_LOG_ENABLED)
2290 auto oldIters = mNumIters;
2291# endif
2292
2293 switch (mState) {
2294 case State::warmup:
2295 if (isCloseEnoughForMeasurements(elapsed)) {
2296 // if elapsed is close enough, we can skip upscaling and go right to measurements
2297 // still, we don't add the result to the measurements.
2298 mState = State::measuring;
2299 mNumIters = calcBestNumIters(elapsed, mNumIters);
2300 } else {
2301 // not close enough: switch to upscaling
2302 mState = State::upscaling_runtime;
2303 upscale(elapsed);
2304 }
2305 break;
2306
2307 case State::upscaling_runtime:
2308 if (isCloseEnoughForMeasurements(elapsed)) {
2309 // if we are close enough, add measurement and switch to always measuring
2310 mState = State::measuring;
2311 mTotalElapsed += elapsed;
2312 mTotalNumIters += mNumIters;
2313 mResult.add(elapsed, mNumIters, pc);
2314 mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2315 } else {
2316 upscale(elapsed);
2317 }
2318 break;
2319
2320 case State::measuring:
2321 // just add measurements - no questions asked. Even when runtime is low. But we can't ignore
2322 // that fluctuation, or else we would bias the result
2323 mTotalElapsed += elapsed;
2324 mTotalNumIters += mNumIters;
2325 mResult.add(elapsed, mNumIters, pc);
2326 if (0 != mBench.epochIterations()) {
2327 mNumIters = mBench.epochIterations();
2328 } else {
2329 mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2330 }
2331 break;
2332
2333 case State::endless:
2334 mNumIters = (std::numeric_limits<uint64_t>::max)();
2335 break;
2336 }
2337
2338 if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
2339 // we got all the results that we need, finish it
2340 showResult("");
2341 mNumIters = 0;
2342 }
2343
2344 ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, d(elapsed.count())) << " elapsed, "
2345 << detail::fmt::Number(20, 3, d(mTargetRuntimePerEpoch.count())) << " target. oldIters="
2346 << oldIters << ", mNumIters=" << mNumIters << ", mState=" << static_cast<int>(mState));
2347 }
2348
2349 // NOLINTNEXTLINE(readability-function-cognitive-complexity)
2350 void showResult(std::string const& errorMessage) const {
2351 ANKERL_NANOBENCH_LOG(errorMessage);
2352
2353 if (mBench.output() != nullptr) {
2354 // prepare column data ///////
2355 std::vector<fmt::MarkDownColumn> columns;
2356
2357 auto rMedian = mResult.median(Result::Measure::elapsed);
2358
2359 if (mBench.relative()) {
2360 double d = 100.0;
2361 if (!mBench.results().empty()) {
2362 d = rMedian <= 0.0 ? 0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0;
2363 }
2364 columns.emplace_back(11, 1, "relative", "%", d);
2365 }
2366
2367 if (mBench.complexityN() > 0) {
2368 columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN());
2369 }
2370
2371 columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "",
2372 rMedian / (mBench.timeUnit().count() * mBench.batch()));
2373 columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);
2374
2375 double const rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed);
2376 columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0);
2377
2378 double rInsMedian = -1.0;
2379 if (mBench.performanceCounters() && mResult.has(Result::Measure::instructions)) {
2380 rInsMedian = mResult.median(Result::Measure::instructions);
2381 columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch());
2382 }
2383
2384 double rCycMedian = -1.0;
2385 if (mBench.performanceCounters() && mResult.has(Result::Measure::cpucycles)) {
2386 rCycMedian = mResult.median(Result::Measure::cpucycles);
2387 columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch());
2388 }
2389 if (rInsMedian > 0.0 && rCycMedian > 0.0) {
2390 columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
2391 }
2392 if (mBench.performanceCounters() && mResult.has(Result::Measure::branchinstructions)) {
2393 double const rBraMedian = mResult.median(Result::Measure::branchinstructions);
2394 columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch());
2395 if (mResult.has(Result::Measure::branchmisses)) {
2396 double p = 0.0;
2397 if (rBraMedian >= 1e-9) {
2398 p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian;
2399 }
2400 columns.emplace_back(10, 1, "miss%", "%", p);
2401 }
2402 }
2403
2404 columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed));
2405
2406 // write everything
2407 auto& os = *mBench.output();
2408
2409 // combine all elements that are relevant for printing the header
2410 uint64_t hash = 0;
2411 hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);
2412 hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);
2413 hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);
2414 hash = hash_combine(std::hash<double>{}(mBench.timeUnit().count()), hash);
2415 hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);
2416 hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);
2417
2418 if (hash != singletonHeaderHash()) {
2419 singletonHeaderHash() = hash;
2420
2421 // no result yet, print header
2422 os << std::endl;
2423 for (auto const& col : columns) {
2424 os << col.title();
2425 }
2426 os << "| " << mBench.title() << std::endl;
2427
2428 for (auto const& col : columns) {
2429 os << col.separator();
2430 }
2431 os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl;
2432 }
2433
2434 if (!errorMessage.empty()) {
2435 for (auto const& col : columns) {
2436 os << col.invalid();
2437 }
2438 os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl;
2439 } else {
2440 for (auto const& col : columns) {
2441 os << col.value();
2442 }
2443 os << "| ";
2444 auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05;
2445 if (showUnstable) {
2446 os << ":wavy_dash: ";
2447 }
2448 os << fmt::MarkDownCode(mBench.name());
2449 if (showUnstable) {
2450 auto avgIters = d(mTotalNumIters) / d(mBench.epochs());
2451 // NOLINTNEXTLINE(bugprone-incorrect-roundings)
2452 auto suggestedIters = static_cast<uint64_t>(avgIters * 10 + 0.5);
2453
2454 os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
2455 << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")";
2456 }
2457 os << std::endl;
2458 }
2459 }
2460 }
2461
2462 ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept {
2463 return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
2464 }
2465
2466 uint64_t mNumIters = 1; // NOLINT(misc-non-private-member-variables-in-classes)
2467 Bench const& mBench; // NOLINT(misc-non-private-member-variables-in-classes)
2468 std::chrono::nanoseconds mTargetRuntimePerEpoch{}; // NOLINT(misc-non-private-member-variables-in-classes)
2469 Result mResult; // NOLINT(misc-non-private-member-variables-in-classes)
2470 Rng mRng{123}; // NOLINT(misc-non-private-member-variables-in-classes)
2471 std::chrono::nanoseconds mTotalElapsed{}; // NOLINT(misc-non-private-member-variables-in-classes)
2472 uint64_t mTotalNumIters = 0; // NOLINT(misc-non-private-member-variables-in-classes)
2473 State mState = State::upscaling_runtime; // NOLINT(misc-non-private-member-variables-in-classes)
2474};
2475ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2476
2477IterationLogic::IterationLogic(Bench const& bench)
2478 : mPimpl(new Impl(bench)) {}
2479
2480IterationLogic::~IterationLogic() {
2481 delete mPimpl;
2482}
2483
2484uint64_t IterationLogic::numIters() const noexcept {
2485 ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters);
2486 return mPimpl->mNumIters;
2487}
2488
2489void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
2490 mPimpl->add(elapsed, pc);
2491}
2492
2493void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
2494 results.emplace_back(std::move(mPimpl->mResult));
2495}
2496
2497# if ANKERL_NANOBENCH(PERF_COUNTERS)
2498
2499ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
2500class LinuxPerformanceCounters {
2501public:
2502 struct Target {
2503 Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_)
2504 : targetValue(targetValue_)
2505 , correctMeasuringOverhead(correctMeasuringOverhead_)
2506 , correctLoopOverhead(correctLoopOverhead_) {}
2507
2508 uint64_t* targetValue{}; // NOLINT(misc-non-private-member-variables-in-classes)
2509 bool correctMeasuringOverhead{}; // NOLINT(misc-non-private-member-variables-in-classes)
2510 bool correctLoopOverhead{}; // NOLINT(misc-non-private-member-variables-in-classes)
2511 };
2512
2513 LinuxPerformanceCounters() = default;
2514 LinuxPerformanceCounters(LinuxPerformanceCounters const&) = delete;
2515 LinuxPerformanceCounters(LinuxPerformanceCounters&&) = delete;
2516 LinuxPerformanceCounters& operator=(LinuxPerformanceCounters const&) = delete;
2517 LinuxPerformanceCounters& operator=(LinuxPerformanceCounters&&) = delete;
2518 ~LinuxPerformanceCounters();
2519
2520 // quick operation
2521 inline void start() {}
2522
2523 inline void stop() {}
2524
2525 bool monitor(perf_sw_ids swId, Target target);
2526 bool monitor(perf_hw_id hwId, Target target);
2527
2528 ANKERL_NANOBENCH(NODISCARD) bool hasError() const noexcept {
2529 return mHasError;
2530 }
2531
2532 // Just reading data is faster than enable & disabling.
2533 // we subtract data ourselves.
2534 inline void beginMeasure() {
2535 if (mHasError) {
2536 return;
2537 }
2538
2539 // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
2540 mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
2541 if (mHasError) {
2542 return;
2543 }
2544
2545 // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
2546 mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
2547 }
2548
2549 inline void endMeasure() {
2550 if (mHasError) {
2551 return;
2552 }
2553
2554 // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
2555 mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
2556 if (mHasError) {
2557 return;
2558 }
2559
2560 auto const numBytes = sizeof(uint64_t) * mCounters.size();
2561 auto ret = read(mFd, mCounters.data(), numBytes);
2562 mHasError = ret != static_cast<ssize_t>(numBytes);
2563 }
2564
2565 void updateResults(uint64_t numIters);
2566
2567 // rounded integer division
2568 template <typename T>
2569 static inline T divRounded(T a, T divisor) {
2570 return (a + divisor / 2) / divisor;
2571 }
2572
2573 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2574 static inline uint32_t mix(uint32_t x) noexcept {
2575 x ^= x << 13U;
2576 x ^= x >> 17U;
2577 x ^= x << 5U;
2578 return x;
2579 }
2580
2581 template <typename Op>
2582 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2583 void calibrate(Op&& op) {
2584 // clear current calibration data,
2585 for (auto& v : mCalibratedOverhead) {
2586 v = UINT64_C(0);
2587 }
2588
2589 // create new calibration data
2590 auto newCalibration = mCalibratedOverhead;
2591 for (auto& v : newCalibration) {
2592 v = (std::numeric_limits<uint64_t>::max)();
2593 }
2594 for (size_t iter = 0; iter < 100; ++iter) {
2595 beginMeasure();
2596 op();
2597 endMeasure();
2598 if (mHasError) {
2599 return;
2600 }
2601
2602 for (size_t i = 0; i < newCalibration.size(); ++i) {
2603 auto diff = mCounters[i];
2604 if (newCalibration[i] > diff) {
2605 newCalibration[i] = diff;
2606 }
2607 }
2608 }
2609
2610 mCalibratedOverhead = std::move(newCalibration);
2611
2612 {
2613 // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles.
2614 // marsaglia's xorshift: mov, sal/shr, xor. Times 3.
2615 // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further.
2616 // see https://godbolt.org/z/49RVQ5
2617 uint64_t const numIters = 100000U + (std::random_device{}() & 3U);
2618 uint64_t n = numIters;
2619 uint32_t x = 1234567;
2620
2621 beginMeasure();
2622 while (n-- > 0) {
2623 x = mix(x);
2624 }
2625 endMeasure();
2627 auto measure1 = mCounters;
2628
2629 n = numIters;
2630 beginMeasure();
2631 while (n-- > 0) {
2632 // we now run *twice* so we can easily calculate the overhead
2633 x = mix(x);
2634 x = mix(x);
2635 }
2636 endMeasure();
2638 auto measure2 = mCounters;
2639
2640 for (size_t i = 0; i < mCounters.size(); ++i) {
2641 // factor 2 because we have two instructions per loop
2642 auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
2643 auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
2644 auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;
2645
2646 mLoopOverhead[i] = divRounded(overhead, numIters);
2647 }
2648 }
2649 }
2650
2651private:
2652 bool monitor(uint32_t type, uint64_t eventid, Target target);
2653
2654 std::map<uint64_t, Target> mIdToTarget{};
2655
2656 // start with minimum size of 3 for read_format
2657 std::vector<uint64_t> mCounters{3};
2658 std::vector<uint64_t> mCalibratedOverhead{3};
2659 std::vector<uint64_t> mLoopOverhead{3};
2660
2661 uint64_t mTimeEnabledNanos = 0;
2662 uint64_t mTimeRunningNanos = 0;
2663 int mFd = -1;
2664 bool mHasError = false;
2665};
2666ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2667
2668LinuxPerformanceCounters::~LinuxPerformanceCounters() {
2669 if (-1 != mFd) {
2670 close(mFd);
2671 }
2672}
2673
2674bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
2675 return monitor(PERF_TYPE_SOFTWARE, swId, target);
2676}
2677
2678bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
2679 return monitor(PERF_TYPE_HARDWARE, hwId, target);
2680}
2681
2682// overflow is ok, it's checked
2683ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2684void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
2685 // clear old data
2686 for (auto& id_value : mIdToTarget) {
2687 *id_value.second.targetValue = UINT64_C(0);
2688 }
2689
2690 if (mHasError) {
2691 return;
2692 }
2693
2694 mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
2695 mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];
2696
2697 for (uint64_t i = 0; i < mCounters[0]; ++i) {
2698 auto idx = static_cast<size_t>(3 + i * 2 + 0);
2699 auto id = mCounters[idx + 1U];
2700
2701 auto it = mIdToTarget.find(id);
2702 if (it != mIdToTarget.end()) {
2703
2704 auto& tgt = it->second;
2705 *tgt.targetValue = mCounters[idx];
2706 if (tgt.correctMeasuringOverhead) {
2707 if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
2708 *tgt.targetValue -= mCalibratedOverhead[idx];
2709 } else {
2710 *tgt.targetValue = 0U;
2711 }
2712 }
2713 if (tgt.correctLoopOverhead) {
2714 auto correctionVal = mLoopOverhead[idx] * numIters;
2715 if (*tgt.targetValue >= correctionVal) {
2716 *tgt.targetValue -= correctionVal;
2717 } else {
2718 *tgt.targetValue = 0U;
2719 }
2720 }
2721 }
2722 }
2723}
2724
2725bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
2726 *target.targetValue = (std::numeric_limits<uint64_t>::max)();
2727 if (mHasError) {
2728 return false;
2729 }
2730
2731 auto pea = perf_event_attr();
2732 std::memset(&pea, 0, sizeof(perf_event_attr));
2733 pea.type = type;
2734 pea.size = sizeof(perf_event_attr);
2735 pea.config = eventid;
2736 pea.disabled = 1; // start counter as disabled
2737 pea.exclude_kernel = 1;
2738 pea.exclude_hv = 1;
2739
2740 // NOLINTNEXTLINE(hicpp-signed-bitwise)
2741 pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
2742
2743 const int pid = 0; // the current process
2744 const int cpu = -1; // all CPUs
2745# if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14
2746 const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
2747# else
2748 const unsigned long flags = 0;
2749# endif
2750
2751 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
2752 auto fd = static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));
2753 if (-1 == fd) {
2754 return false;
2755 }
2756 if (-1 == mFd) {
2757 // first call: set to fd, and use this from now on
2758 mFd = fd;
2759 }
2760 uint64_t id = 0;
2761 // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
2762 if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) {
2763 // couldn't get id
2764 return false;
2765 }
2766
2767 // insert into map, rely on the fact that map's references are constant.
2768 mIdToTarget.emplace(id, target);
2769
2770 // prepare readformat with the correct size (after the insert)
2771 auto size = 3 + 2 * mIdToTarget.size();
2772 mCounters.resize(size);
2773 mCalibratedOverhead.resize(size);
2774 mLoopOverhead.resize(size);
2775
2776 return true;
2777}
2778
2779PerformanceCounters::PerformanceCounters()
2780 : mPc(new LinuxPerformanceCounters())
2781 , mVal()
2782 , mHas() {
2783
2784 // HW events
2785 mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
2786 if (!mHas.cpuCycles) {
2787 // Fallback to cycles counter, reference cycles not available in many systems.
2788 mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
2789 }
2790 mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true));
2791 mHas.branchInstructions =
2792 mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false));
2793 mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false));
2794 // mHas.branchMisses = false;
2795
2796 // SW events
2797 mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false));
2798 mHas.contextSwitches =
2799 mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false));
2800
2801 mPc->start();
2802 mPc->calibrate([] {
2803 auto before = ankerl::nanobench::Clock::now();
2804 auto after = ankerl::nanobench::Clock::now();
2805 (void)before;
2806 (void)after;
2807 });
2808
2809 if (mPc->hasError()) {
2810 // something failed, don't monitor anything.
2811 mHas = PerfCountSet<bool>{};
2812 }
2813}
2814
2815PerformanceCounters::~PerformanceCounters() {
2816 // no need to check for nullptr, delete nullptr has no effect
2817 delete mPc;
2818}
2819
2820void PerformanceCounters::beginMeasure() {
2821 mPc->beginMeasure();
2822}
2823
2824void PerformanceCounters::endMeasure() {
2825 mPc->endMeasure();
2826}
2827
2828void PerformanceCounters::updateResults(uint64_t numIters) {
2829 mPc->updateResults(numIters);
2830}
2831
2832# else
2833
2834PerformanceCounters::PerformanceCounters() = default;
2835PerformanceCounters::~PerformanceCounters() = default;
2836void PerformanceCounters::beginMeasure() {}
2837void PerformanceCounters::endMeasure() {}
2838void PerformanceCounters::updateResults(uint64_t) {}
2839
2840# endif
2841
2842ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& PerformanceCounters::val() const noexcept {
2843 return mVal;
2844}
2845ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& PerformanceCounters::has() const noexcept {
2846 return mHas;
2847}
2848
2849// formatting utilities
2850namespace fmt {
2851
2852// adds thousands separator to numbers
2853NumSep::NumSep(char sep)
2854 : mSep(sep) {}
2855
2856char NumSep::do_thousands_sep() const {
2857 return mSep;
2858}
2859
2860std::string NumSep::do_grouping() const {
2861 return "\003";
2862}
2863
2864// RAII to save & restore a stream's state
2865StreamStateRestorer::StreamStateRestorer(std::ostream& s)
2866 : mStream(s)
2867 , mLocale(s.getloc())
2868 , mPrecision(s.precision())
2869 , mWidth(s.width())
2870 , mFill(s.fill())
2871 , mFmtFlags(s.flags()) {}
2872
2873StreamStateRestorer::~StreamStateRestorer() {
2874 restore();
2875}
2876
2877// sets back all stream info that we remembered at construction
2878void StreamStateRestorer::restore() {
2879 mStream.imbue(mLocale);
2880 mStream.precision(mPrecision);
2881 mStream.width(mWidth);
2882 mStream.fill(mFill);
2883 mStream.flags(mFmtFlags);
2884}
2885
2886Number::Number(int width, int precision, int64_t value)
2887 : mWidth(width)
2888 , mPrecision(precision)
2889 , mValue(d(value)) {}
2890
2891Number::Number(int width, int precision, double value)
2892 : mWidth(width)
2893 , mPrecision(precision)
2894 , mValue(value) {}
2895
2896std::ostream& Number::write(std::ostream& os) const {
2897 StreamStateRestorer const restorer(os);
2898 os.imbue(std::locale(os.getloc(), new NumSep(',')));
2899 os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
2900 return os;
2901}
2902
2903std::string Number::to_s() const {
2904 std::stringstream ss;
2905 write(ss);
2906 return ss.str();
2907}
2908
2909std::string to_s(uint64_t n) {
2910 std::string str;
2911 do {
2912 str += static_cast<char>('0' + static_cast<char>(n % 10));
2913 n /= 10;
2914 } while (n != 0);
2915 std::reverse(str.begin(), str.end());
2916 return str;
2917}
2918
2919std::ostream& operator<<(std::ostream& os, Number const& n) {
2920 return n.write(os);
2921}
2922
2923MarkDownColumn::MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val) noexcept
2924 : mWidth(w)
2925 , mPrecision(prec)
2926 , mTitle(std::move(tit))
2927 , mSuffix(std::move(suff))
2928 , mValue(val) {}
2929
2930std::string MarkDownColumn::title() const {
2931 std::stringstream ss;
2932 ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' ';
2933 return ss.str();
2934}
2935
2936std::string MarkDownColumn::separator() const {
2937 std::string sep(static_cast<size_t>(mWidth), '-');
2938 sep.front() = '|';
2939 sep.back() = ':';
2940 return sep;
2941}
2942
2943std::string MarkDownColumn::invalid() const {
2944 std::string sep(static_cast<size_t>(mWidth), ' ');
2945 sep.front() = '|';
2946 sep[sep.size() - 2] = '-';
2947 return sep;
2948}
2949
2950std::string MarkDownColumn::value() const {
2951 std::stringstream ss;
2952 auto width = mWidth - 2 - static_cast<int>(mSuffix.size());
2953 ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' ';
2954 return ss.str();
2955}
2956
2957// Formats any text as markdown code, escaping backticks.
2958MarkDownCode::MarkDownCode(std::string const& what) {
2959 mWhat.reserve(what.size() + 2);
2960 mWhat.push_back('`');
2961 for (char const c : what) {
2962 mWhat.push_back(c);
2963 if ('`' == c) {
2964 mWhat.push_back('`');
2965 }
2966 }
2967 mWhat.push_back('`');
2968}
2969
2970std::ostream& MarkDownCode::write(std::ostream& os) const {
2971 return os << mWhat;
2972}
2973
2974std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) {
2975 return mdCode.write(os);
2976}
2977} // namespace fmt
2978} // namespace detail
2979
2980// provide implementation here so it's only generated once
2981Config::Config() = default;
2982Config::~Config() = default;
2983Config& Config::operator=(Config const&) = default;
2984Config& Config::operator=(Config&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
2985Config::Config(Config const&) = default;
2986Config::Config(Config&&) noexcept = default;
2987
2988// provide implementation here so it's only generated once
2989Result::~Result() = default;
2990Result& Result::operator=(Result const&) = default;
2991Result& Result::operator=(Result&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
2992Result::Result(Result const&) = default;
2993Result::Result(Result&&) noexcept = default;
2994
2995namespace detail {
2996template <typename T>
2997inline constexpr typename std::underlying_type<T>::type u(T val) noexcept {
2998 return static_cast<typename std::underlying_type<T>::type>(val);
2999}
3000} // namespace detail
3001
3002// Result returned after a benchmark has finished. Can be used as a baseline for relative().
3003Result::Result(Config benchmarkConfig)
3004 : mConfig(std::move(benchmarkConfig))
3005 , mNameToMeasurements{detail::u(Result::Measure::_size)} {}
3006
3007void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) {
3008 using detail::d;
3009 using detail::u;
3010
3011 double const dIters = d(iters);
3012 mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);
3013
3014 mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
3015 if (pc.has().pageFaults) {
3016 mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
3017 }
3018 if (pc.has().cpuCycles) {
3019 mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
3020 }
3021 if (pc.has().contextSwitches) {
3022 mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
3023 }
3024 if (pc.has().instructions) {
3025 mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
3026 }
3027 if (pc.has().branchInstructions) {
3028 double branchInstructions = 0.0;
3029 // correcting branches: remove branch introduced by the while (...) loop for each iteration.
3030 if (pc.val().branchInstructions > iters + 1U) {
3031 branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
3032 }
3033 mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);
3034
3035 if (pc.has().branchMisses) {
3036 // correcting branch misses
3037 double branchMisses = d(pc.val().branchMisses);
3038 if (branchMisses > branchInstructions) {
3039 // can't have branch misses when there were branches...
3040 branchMisses = branchInstructions;
3041 }
3042
3043 // assuming at least one missed branch for the loop
3044 branchMisses -= 1.0;
3045 if (branchMisses < 1.0) {
3046 branchMisses = 1.0;
3047 }
3048 mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
3049 }
3050 }
3051}
3052
3053Config const& Result::config() const noexcept {
3054 return mConfig;
3055}
3056
3057inline double calcMedian(std::vector<double>& data) {
3058 if (data.empty()) {
3059 return 0.0;
3060 }
3061 std::sort(data.begin(), data.end());
3062
3063 auto midIdx = data.size() / 2U;
3064 if (1U == (data.size() & 1U)) {
3065 return data[midIdx];
3066 }
3067 return (data[midIdx - 1U] + data[midIdx]) / 2U;
3068}
3069
3070double Result::median(Measure m) const {
3071 // create a copy so we can sort
3072 auto data = mNameToMeasurements[detail::u(m)];
3073 return calcMedian(data);
3074}
3075
3076double Result::average(Measure m) const {
3077 using detail::d;
3078 auto const& data = mNameToMeasurements[detail::u(m)];
3079 if (data.empty()) {
3080 return 0.0;
3081 }
3082
3083 // create a copy so we can sort
3084 return sum(m) / d(data.size());
3085}
3086
3087double Result::medianAbsolutePercentError(Measure m) const {
3088 // create copy
3089 auto data = mNameToMeasurements[detail::u(m)];
3090
3091 // calculates MdAPE which is the median of percentage error
3092 // see https://support.numxl.com/hc/en-us/articles/115001223503-MdAPE-Median-Absolute-Percentage-Error
3093 auto med = calcMedian(data);
3094
3095 // transform the data to absolute error
3096 for (auto& x : data) {
3097 x = (x - med) / x;
3098 if (x < 0) {
3099 x = -x;
3100 }
3101 }
3102 return calcMedian(data);
3103}
3104
3105double Result::sum(Measure m) const noexcept {
3106 auto const& data = mNameToMeasurements[detail::u(m)];
3107 return std::accumulate(data.begin(), data.end(), 0.0);
3108}
3109
3110double Result::sumProduct(Measure m1, Measure m2) const noexcept {
3111 auto const& data1 = mNameToMeasurements[detail::u(m1)];
3112 auto const& data2 = mNameToMeasurements[detail::u(m2)];
3113
3114 if (data1.size() != data2.size()) {
3115 return 0.0;
3116 }
3117
3118 double result = 0.0;
3119 for (size_t i = 0, s = data1.size(); i != s; ++i) {
3120 result += data1[i] * data2[i];
3121 }
3122 return result;
3123}
3124
3125bool Result::has(Measure m) const noexcept {
3126 return !mNameToMeasurements[detail::u(m)].empty();
3127}
3128
3129double Result::get(size_t idx, Measure m) const {
3130 auto const& data = mNameToMeasurements[detail::u(m)];
3131 return data.at(idx);
3132}
3133
3134bool Result::empty() const noexcept {
3135 return 0U == size();
3136}
3137
3138size_t Result::size() const noexcept {
3139 auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
3140 return data.size();
3141}
3142
3143double Result::minimum(Measure m) const noexcept {
3144 auto const& data = mNameToMeasurements[detail::u(m)];
3145 if (data.empty()) {
3146 return 0.0;
3147 }
3148
3149 // here its save to assume that at least one element is there
3150 return *std::min_element(data.begin(), data.end());
3151}
3152
3153double Result::maximum(Measure m) const noexcept {
3154 auto const& data = mNameToMeasurements[detail::u(m)];
3155 if (data.empty()) {
3156 return 0.0;
3157 }
3158
3159 // here its save to assume that at least one element is there
3160 return *std::max_element(data.begin(), data.end());
3161}
3162
3163std::string const& Result::context(char const* variableName) const {
3164 return mConfig.mContext.at(variableName);
3165}
3166
3167std::string const& Result::context(std::string const& variableName) const {
3168 return mConfig.mContext.at(variableName);
3169}
3170
3171Result::Measure Result::fromString(std::string const& str) {
3172 if (str == "elapsed") {
3173 return Measure::elapsed;
3174 }
3175 if (str == "iterations") {
3176 return Measure::iterations;
3177 }
3178 if (str == "pagefaults") {
3179 return Measure::pagefaults;
3180 }
3181 if (str == "cpucycles") {
3182 return Measure::cpucycles;
3183 }
3184 if (str == "contextswitches") {
3185 return Measure::contextswitches;
3186 }
3187 if (str == "instructions") {
3188 return Measure::instructions;
3189 }
3190 if (str == "branchinstructions") {
3191 return Measure::branchinstructions;
3192 }
3193 if (str == "branchmisses") {
3194 return Measure::branchmisses;
3195 }
3196 // not found, return _size
3197 return Measure::_size;
3198}
3199
3200// Configuration of a microbenchmark.
3201Bench::Bench() {
3202 mConfig.mOut = &std::cout;
3203}
3204
3205Bench::Bench(Bench&&) noexcept = default;
3206Bench& Bench::operator=(Bench&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
3207Bench::Bench(Bench const&) = default;
3208Bench& Bench::operator=(Bench const&) = default;
3209Bench::~Bench() noexcept = default;
3210
3211double Bench::batch() const noexcept {
3212 return mConfig.mBatch;
3213}
3214
3215double Bench::complexityN() const noexcept {
3216 return mConfig.mComplexityN;
3217}
3218
3219// Set a baseline to compare it to. 100% it is exactly as fast as the baseline, >100% means it is faster than the baseline, <100%
3220// means it is slower than the baseline.
3221Bench& Bench::relative(bool isRelativeEnabled) noexcept {
3222 mConfig.mIsRelative = isRelativeEnabled;
3223 return *this;
3224}
3225bool Bench::relative() const noexcept {
3226 return mConfig.mIsRelative;
3227}
3228
3229Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {
3230 mConfig.mShowPerformanceCounters = showPerformanceCounters;
3231 return *this;
3232}
3233bool Bench::performanceCounters() const noexcept {
3234 return mConfig.mShowPerformanceCounters;
3235}
3236
3237// Operation unit. Defaults to "op", could be e.g. "byte" for string processing.
3238// If u differs from currently set unit, the stored results will be cleared.
3239// Use singular (byte, not bytes).
3240Bench& Bench::unit(char const* u) {
3241 if (u != mConfig.mUnit) {
3242 mResults.clear();
3243 }
3244 mConfig.mUnit = u;
3245 return *this;
3246}
3247
3248Bench& Bench::unit(std::string const& u) {
3249 return unit(u.c_str());
3250}
3251
3252std::string const& Bench::unit() const noexcept {
3253 return mConfig.mUnit;
3254}
3255
3256Bench& Bench::timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName) {
3257 mConfig.mTimeUnit = tu;
3258 mConfig.mTimeUnitName = tuName;
3259 return *this;
3260}
3261
3262std::string const& Bench::timeUnitName() const noexcept {
3263 return mConfig.mTimeUnitName;
3264}
3265
3266std::chrono::duration<double> const& Bench::timeUnit() const noexcept {
3267 return mConfig.mTimeUnit;
3268}
3269
3270// If benchmarkTitle differs from currently set title, the stored results will be cleared.
3271Bench& Bench::title(const char* benchmarkTitle) {
3272 if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3273 mResults.clear();
3274 }
3275 mConfig.mBenchmarkTitle = benchmarkTitle;
3276 return *this;
3277}
3278Bench& Bench::title(std::string const& benchmarkTitle) {
3279 if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3280 mResults.clear();
3281 }
3282 mConfig.mBenchmarkTitle = benchmarkTitle;
3283 return *this;
3284}
3285
3286std::string const& Bench::title() const noexcept {
3287 return mConfig.mBenchmarkTitle;
3288}
3289
3290Bench& Bench::name(const char* benchmarkName) {
3291 mConfig.mBenchmarkName = benchmarkName;
3292 return *this;
3293}
3294
3295Bench& Bench::name(std::string const& benchmarkName) {
3296 mConfig.mBenchmarkName = benchmarkName;
3297 return *this;
3298}
3299
3300std::string const& Bench::name() const noexcept {
3301 return mConfig.mBenchmarkName;
3302}
3303
3304Bench& Bench::context(char const* variableName, char const* variableValue) {
3305 mConfig.mContext[variableName] = variableValue;
3306 return *this;
3307}
3308
3309Bench& Bench::context(std::string const& variableName, std::string const& variableValue) {
3310 mConfig.mContext[variableName] = variableValue;
3311 return *this;
3312}
3313
3314Bench& Bench::clearContext() {
3315 mConfig.mContext.clear();
3316 return *this;
3317}
3318
3319// Number of epochs to evaluate. The reported result will be the median of evaluation of each epoch.
3320Bench& Bench::epochs(size_t numEpochs) noexcept {
3321 mConfig.mNumEpochs = numEpochs;
3322 return *this;
3323}
3324size_t Bench::epochs() const noexcept {
3325 return mConfig.mNumEpochs;
3326}
3327
3328// Desired evaluation time is a multiple of clock resolution. Default is to be 1000 times above this measurement precision.
3329Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept {
3330 mConfig.mClockResolutionMultiple = multiple;
3331 return *this;
3332}
3333size_t Bench::clockResolutionMultiple() const noexcept {
3334 return mConfig.mClockResolutionMultiple;
3335}
3336
3337// Sets the maximum time each epoch should take. Default is 100ms.
3338Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
3339 mConfig.mMaxEpochTime = t;
3340 return *this;
3341}
3342std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
3343 return mConfig.mMaxEpochTime;
3344}
3345
3346// Sets the maximum time each epoch should take. Default is 100ms.
3347Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
3348 mConfig.mMinEpochTime = t;
3349 return *this;
3350}
3351std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
3352 return mConfig.mMinEpochTime;
3353}
3354
3355Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
3356 mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
3357 return *this;
3358}
3359uint64_t Bench::minEpochIterations() const noexcept {
3360 return mConfig.mMinEpochIterations;
3361}
3362
3363Bench& Bench::epochIterations(uint64_t numIters) noexcept {
3364 mConfig.mEpochIterations = numIters;
3365 return *this;
3366}
3367uint64_t Bench::epochIterations() const noexcept {
3368 return mConfig.mEpochIterations;
3369}
3370
3371Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
3372 mConfig.mWarmup = numWarmupIters;
3373 return *this;
3374}
3375uint64_t Bench::warmup() const noexcept {
3376 return mConfig.mWarmup;
3377}
3378
3379Bench& Bench::config(Config const& benchmarkConfig) {
3380 mConfig = benchmarkConfig;
3381 return *this;
3382}
3383Config const& Bench::config() const noexcept {
3384 return mConfig;
3385}
3386
3387Bench& Bench::output(std::ostream* outstream) noexcept {
3388 mConfig.mOut = outstream;
3389 return *this;
3390}
3391
3392ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept {
3393 return mConfig.mOut;
3394}
3395
3396std::vector<Result> const& Bench::results() const noexcept {
3397 return mResults;
3398}
3399
3400Bench& Bench::render(char const* templateContent, std::ostream& os) {
3401 ::ankerl::nanobench::render(templateContent, *this, os);
3402 return *this;
3403}
3404
3405Bench& Bench::render(std::string const& templateContent, std::ostream& os) {
3406 ::ankerl::nanobench::render(templateContent, *this, os);
3407 return *this;
3408}
3409
3410std::vector<BigO> Bench::complexityBigO() const {
3411 std::vector<BigO> bigOs;
3412 auto rangeMeasure = BigO::collectRangeMeasure(mResults);
3413 bigOs.emplace_back("O(1)", rangeMeasure, [](double) {
3414 return 1.0;
3415 });
3416 bigOs.emplace_back("O(n)", rangeMeasure, [](double n) {
3417 return n;
3418 });
3419 bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) {
3420 return std::log2(n);
3421 });
3422 bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) {
3423 return n * std::log2(n);
3424 });
3425 bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) {
3426 return n * n;
3427 });
3428 bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) {
3429 return n * n * n;
3430 });
3431 std::sort(bigOs.begin(), bigOs.end());
3432 return bigOs;
3433}
3434
3435Rng::Rng()
3436 : mX(0)
3437 , mY(0) {
3438 std::random_device rd;
3439 std::uniform_int_distribution<uint64_t> dist;
3440 do {
3441 mX = dist(rd);
3442 mY = dist(rd);
3443 } while (mX == 0 && mY == 0);
3444}
3445
3446ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
3447uint64_t splitMix64(uint64_t& state) noexcept {
3448 uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15));
3449 z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
3450 z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb);
3451 return z ^ (z >> 31U);
3452}
3453
3454// Seeded as described in romu paper (update april 2020)
3455Rng::Rng(uint64_t seed) noexcept
3456 : mX(splitMix64(seed))
3457 , mY(splitMix64(seed)) {
3458 for (size_t i = 0; i < 10; ++i) {
3459 operator()();
3460 }
3461}
3462
3463// only internally used to copy the RNG.
3464Rng::Rng(uint64_t x, uint64_t y) noexcept
3465 : mX(x)
3466 , mY(y) {}
3467
3468Rng Rng::copy() const noexcept {
3469 return Rng{mX, mY};
3470}
3471
3472Rng::Rng(std::vector<uint64_t> const& data)
3473 : mX(0)
3474 , mY(0) {
3475 if (data.size() != 2) {
3476 throw std::runtime_error("ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " +
3477 detail::fmt::to_s(data.size()));
3478 }
3479 mX = data[0];
3480 mY = data[1];
3481}
3482
3483std::vector<uint64_t> Rng::state() const {
3484 std::vector<uint64_t> data(2);
3485 data[0] = mX;
3486 data[1] = mY;
3487 return data;
3488}
3489
3490BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result> const& results) {
3491 BigO::RangeMeasure rangeMeasure;
3492 for (auto const& result : results) {
3493 if (result.config().mComplexityN > 0.0) {
3494 rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
3495 }
3496 }
3497 return rangeMeasure;
3498}
3499
3500BigO::BigO(std::string bigOName, RangeMeasure const& rangeMeasure)
3501 : mName(std::move(bigOName)) {
3502
3503 // estimate the constant factor
3504 double sumRangeMeasure = 0.0;
3505 double sumRangeRange = 0.0;
3506
3507 for (const auto& rm : rangeMeasure) {
3508 sumRangeMeasure += rm.first * rm.second;
3509 sumRangeRange += rm.first * rm.first;
3510 }
3511 mConstant = sumRangeMeasure / sumRangeRange;
3512
3513 // calculate root mean square
3514 double err = 0.0;
3515 double sumMeasure = 0.0;
3516 for (const auto& rm : rangeMeasure) {
3517 auto diff = mConstant * rm.first - rm.second;
3518 err += diff * diff;
3519
3520 sumMeasure += rm.second;
3521 }
3522
3523 auto n = detail::d(rangeMeasure.size());
3524 auto mean = sumMeasure / n;
3525 mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
3526}
3527
3528BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure)
3529 : BigO(std::string(bigOName), rangeMeasure) {}
3530
3531std::string const& BigO::name() const noexcept {
3532 return mName;
3533}
3534
3535double BigO::constant() const noexcept {
3536 return mConstant;
3537}
3538
3539double BigO::normalizedRootMeanSquare() const noexcept {
3540 return mNormalizedRootMeanSquare;
3541}
3542
3543bool BigO::operator<(BigO const& other) const noexcept {
3544 return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
3545}
3546
3547std::ostream& operator<<(std::ostream& os, BigO const& bigO) {
3548 return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare();
3549}
3550
3551std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs) {
3552 detail::fmt::StreamStateRestorer const restorer(os);
3553 os << std::endl << "| coefficient | err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl;
3554 for (auto const& bigO : bigOs) {
3555 os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " ";
3556 os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% ";
3557 os << "| " << bigO.name();
3558 os << std::endl;
3559 }
3560 return os;
3561}
3562
3563} // namespace nanobench
3564} // namespace ankerl
3565
3566#endif // ANKERL_NANOBENCH_IMPLEMENT
3567#endif // ANKERL_NANOBENCH_H_INCLUDED
int ret
int flags
Definition: bitcoin-tx.cpp:530
Main entry point to nanobench's benchmarking facility.
Definition: nanobench.h:649
Bench & operator=(Bench const &other)
ANKERL_NANOBENCH(NODISCARD) std Bench & doNotOptimizeAway(Arg &&arg)
Retrieves all benchmark results collected by the bench object so far.
Bench & run(char const *benchmarkName, Op &&op)
Repeatedly calls op() based on the configuration, and performs measurements.
Definition: nanobench.h:1308
Bench & batch(T b) noexcept
Sets the batch size.
Definition: nanobench.h:1332
std::vector< BigO > complexityBigO() const
Bench()
Creates a new benchmark for configuration and running of benchmarks.
Bench & operator=(Bench &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))
detail::SetupRunner< SetupOp > setup(SetupOp setupOp)
Configure an untimed setup step per epoch (forces single-iteration epochs).
Definition: nanobench.h:1302
Bench(Bench &&other) noexcept
Bench(Bench const &other)
Bench & complexityN(T n) noexcept
Definition: nanobench.h:1339
static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op)
Definition: nanobench.h:1152
BigO(std::string bigOName, RangeMeasure const &scaledRangeMeasure)
std::vector< std::pair< double, double > > RangeMeasure
Definition: nanobench.h:1149
BigO(char const *bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
Definition: nanobench.h:1162
static RangeMeasure collectRangeMeasure(std::vector< Result > const &results)
BigO(std::string bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
Definition: nanobench.h:1166
BigO(char const *bigOName, RangeMeasure const &scaledRangeMeasure)
Result(Config benchmarkConfig)
static Measure fromString(std::string const &str)
void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const &pc)
Result(Result &&other) noexcept
ANKERL_NANOBENCH(NODISCARD) Config const &config() const noexcept
Result & operator=(Result const &other)
Result(Result const &other)
Result & operator=(Result &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))
An extremely fast random generator.
Definition: nanobench.h:510
static constexpr uint64_t() min()
Rng(Rng const &)=delete
As a safety precaution, we don't allow copying.
void shuffle(Container &container) noexcept
Shuffles all entries in the given container.
Definition: nanobench.h:1227
Rng(Rng &&) noexcept=default
Rng & operator=(Rng const &)=delete
Same as Rng(Rng const&), we don't allow assignment.
static constexpr uint64_t() max()
double uniform01() noexcept
Provides a random uniform double value between 0 and 1.
Definition: nanobench.h:1217
uint64_t result_type
This RNG provides 64bit randomness.
Definition: nanobench.h:515
void moveResultTo(std::vector< Result > &results) noexcept
void add(std::chrono::nanoseconds elapsed, PerformanceCounters const &pc) noexcept
IterationLogic(IterationLogic &&)=delete
IterationLogic & operator=(IterationLogic const &)=delete
ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept
IterationLogic(IterationLogic const &)=delete
IterationLogic & operator=(IterationLogic &&)=delete
PerformanceCounters(PerformanceCounters const &)=delete
PerformanceCounters & operator=(PerformanceCounters const &)=delete
PerformanceCounters & operator=(PerformanceCounters &&)=delete
ANKERL_NANOBENCH(NODISCARD) PerfCountSet< uint64_t > const &val() const noexcept
PerformanceCounters(PerformanceCounters &&)=delete
SetupRunner(SetupOp setupOp, Bench &bench)
Definition: nanobench.h:1251
volatile double sum
Definition: examples.cpp:10
#define T(expected, seed, data)
void doNotOptimizeAway(T &val)
Definition: nanobench.h:1081
PerformanceCounters & performanceCounters()
void doNotOptimizeAway(T const &val)
Definition: nanobench.h:1075
char const * json() noexcept
Template to generate JSON data.
char const * csv() noexcept
CSV data for the benchmark results.
char const * pyperf() noexcept
Output in pyperf compatible JSON format, which can be used for more analyzation.
char const * htmlBoxplot() noexcept
HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an exampl...
void render(char const *mustacheTemplate, Bench const &bench, std::ostream &out)
Renders output from a mustache-like template and benchmark results.
std::conditional< std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock, std::chrono::steady_clock >::type Clock
Definition: nanobench.h:148
void render(std::string const &mustacheTemplate, std::vector< Result > const &results, std::ostream &out)
std::ostream & operator<<(std::ostream &os, BigO const &bigO)
std::ostream & operator<<(std::ostream &os, std::vector< ankerl::nanobench::BigO > const &bigOs)
void doNotOptimizeAway(Arg &&arg)
Makes sure none of the given arguments are optimized away by the compiler.
Definition: nanobench.h:1353
#define ANKERL_NANOBENCH_LOG(x)
Definition: nanobench.h:97
#define ANKERL_NANOBENCH_NO_SANITIZE(...)
Definition: nanobench.h:116
#define ANKERL_NANOBENCH(x)
Definition: nanobench.h:59
bool operator==(const CNetAddr &a, const CNetAddr &b)
Definition: netaddress.cpp:603
bool operator<(const CNetAddr &a, const CNetAddr &b)
Definition: netaddress.cpp:608
const char * name
Definition: rest.cpp:50
static RPCMethod stop()
Definition: server.cpp:145
Config & operator=(Config const &other)
Config(Config const &other)
Config & operator=(Config &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))
Config(Config &&other) noexcept
static SECP256K1_INLINE uint64_t rotl(const uint64_t x, int k)
Definition: testrand_impl.h:40
static int setup(void)
Definition: tests.c:8056
static int count
assert(!tx.IsCoinBase())