30#ifndef ANKERL_NANOBENCH_H_INCLUDED
31#define ANKERL_NANOBENCH_H_INCLUDED
34#define ANKERL_NANOBENCH_VERSION_MAJOR 4
35#define ANKERL_NANOBENCH_VERSION_MINOR 3
36#define ANKERL_NANOBENCH_VERSION_PATCH 11
55#include <unordered_map>
// Feature-query helper: ANKERL_NANOBENCH(x) token-pastes x onto ANKERL_NANOBENCH_PRIVATE_ and
// invokes the resulting function-like macro, so ANKERL_NANOBENCH(CXX17) expands to 201703L.
59#define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
// CXX() is the __cplusplus value of the current translation unit; the CXXnn() macros are the
// fixed values it is compared against.
61#define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
62#define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
63#define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
64#define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
65#define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
67#if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
68# define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
70# define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
74# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
75 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
76# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
78# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
79# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
83# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
84# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
86# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
87# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
90#if defined(ANKERL_NANOBENCH_LOG_ENABLED)
92# define ANKERL_NANOBENCH_LOG(x) \
94 std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \
97# define ANKERL_NANOBENCH_LOG(x) \
102#define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
// The definition above is the default (0 = performance counters unavailable). It is replaced by 1
// only on Linux, when ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS is not defined and the kernel headers
// report version 3.3.0 or newer. Query it with ANKERL_NANOBENCH(PERF_COUNTERS).
103#if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
104# include <linux/version.h>
105# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0)
108# undef ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS
109# define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
113#if defined(__clang__)
114# define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
116# define ANKERL_NANOBENCH_NO_SANITIZE(...)
120# define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
122# define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
// Trivially-copyable check with a compiler fallback: when __GNUC__ is defined and below 5 the
// __has_trivial_copy builtin is used; the other definition uses std::is_trivially_copyable<...>::value.
// NOTE(review): presumably the fallback exists because older libstdc++ lacks the trait - confirm.
127#if defined(__GNUC__) && __GNUC__ < 5
128# define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
130# define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
// Compile-time bool: true when std::string's move assignment is noexcept on this standard library.
// Use as ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE), e.g. inside a conditional noexcept(...).
135#define ANKERL_NANOBENCH_PRIVATE_NOEXCEPT_STRING_MOVE() std::is_nothrow_move_assignable<std::string>::value
// Clock used for all timing: std::chrono::high_resolution_clock when it is steady (is_steady is
// true), otherwise std::chrono::steady_clock. Either way the selected clock is monotonic.
147using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
148 std::chrono::steady_clock>::type;
156template <
typename SetupOp>
// Renders a mustache-style template into `out`.
//
// This overload takes the template as std::string and the data from a Bench; its definition
// forwards mustacheTemplate.c_str() and bench.results() to the (char const*, results) overload.
311void render(std::string
const& mustacheTemplate,
Bench const& bench, std::ostream&
out);
// Main implementation. Parses the template and writes the expansion to `out`:
//  * the stream's formatting state is captured by a StreamStateRestorer for the duration of the
//    call, and precision is set to std::numeric_limits<double>::digits10;
//  * a top-level {{#result}} section is expanded once per element of `results`;
//  * a top-level {{#measurement}} section is only accepted when `results` holds exactly one
//    element, otherwise std::runtime_error is thrown;
//  * unknown sections, inverted sections and unknown tags also throw std::runtime_error.
321void render(
char const* mustacheTemplate, std::vector<Result>
const& results, std::ostream&
out);
// Convenience overload: forwards mustacheTemplate.c_str() to the overload above.
322void render(std::string
const& mustacheTemplate, std::vector<Result>
const& results, std::ostream&
out);
// Returns a built-in template for render(): a semicolon-separated header row
// ("title";"name";"unit";"batch";"elapsed";"error %";...) followed by one row per {{#result}}.
// The returned pointer refers to a string literal, so it needs no freeing.
336char const*
csv() noexcept;
377class PerformanceCounters;
379#if ANKERL_NANOBENCH(PERF_COUNTERS)
380class LinuxPerformanceCounters;
// Benchmark configuration values and their defaults.
// NOTE(review): the usage notes below describe what IterationLogic::Impl and the template
// generator do with the same-named Bench accessors; presumably those accessors read these
// fields - confirm against the accessor definitions.
//
// Title, name and unit are streamed verbatim by generateConfigTag().
410 std::string mBenchmarkTitle =
"benchmark";
411 std::string mBenchmarkName =
"noname";
412 std::string mUnit =
"op";
// Non-positive by default; showResult() only adds a "complexityN" column when the value is > 0.
414 double mComplexityN = -1.0;
// Measuring stops once the result holds this many epochs.
415 size_t mNumEpochs = 11;
// Target runtime per epoch starts as clockResolution() * this multiple ...
416 size_t mClockResolutionMultiple =
static_cast<size_t>(1000);
// ... and is then clamped to at most mMaxEpochTime and at least mMinEpochTime.
417 std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100);
418 std::chrono::nanoseconds mMinEpochTime = std::chrono::milliseconds(1);
// Starting iteration count while upscaling, and lower bound for every computed iteration count.
419 uint64_t mMinEpochIterations{1};
// 0 means "determine automatically"; any other value fixes the iterations per epoch and skips upscaling.
421 uint64_t mEpochIterations{0};
// Non-zero: number of iterations of a warmup round executed before measuring.
422 uint64_t mWarmup = 0;
// Destination for the markdown table and stability warnings; nothing is printed when it is nullptr.
423 std::ostream* mOut =
nullptr;
// Time unit the results are scaled by, and the label used in the "<unit name>/<unit>" column heading.
424 std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1};
425 std::string mTimeUnitName =
"ns";
426 bool mShowPerformanceCounters =
true;
// When set, showResult() adds a "relative" percentage column.
427 bool mIsRelative =
false;
// Free-form key/value pairs; presumably what the {{context(name)}} template command looks up.
428 std::unordered_map<std::string, std::string> mContext{};
489 std::vector<std::vector<double>> mNameToMeasurements{};
// Smallest / largest value the generator can produce. The parentheses around the names keep a
// function-like macro called min or max (if one is defined) from being expanded here.
// NOTE(review): the visible definition returns std::numeric_limits<uint64_t>::max - presumably for (max)().
517 static constexpr uint64_t(min)();
518 static constexpr uint64_t(max)();
534 Rng& operator=(
Rng&&) noexcept = default;
535 ~
Rng() noexcept = default;
562 explicit
Rng(uint64_t seed) noexcept;
563 Rng(uint64_t x, uint64_t y) noexcept;
// Produces the next 64-bit random value and advances the state: mX becomes
// 15241094284759029579 * mY and mY becomes rotl(mY - x, 27), where x is presumably the previous mX
// and also the returned value - TODO confirm against the full definition.
578 inline uint64_t operator()() noexcept;
// Returns a random value in [0, range). Takes the low 32 bits of operator()(), multiplies them by
// `range` in 64-bit arithmetic and returns the upper 32 bits of the product. There is no rejection
// step, so the result is slightly non-uniform when range does not divide 2^32.
596 inline uint32_t bounded(uint32_t range) noexcept;
// Random double built from one operator()() call: the top 52 bits of the random value become the
// mantissa under the fixed exponent pattern 0x3ff, and the bits are copied into a double with
// std::memcpy. That bit pattern is a value in [1, 2); presumably 1.0 is subtracted before
// returning so the result lies in [0, 1) - TODO confirm.
607 inline
double uniform01() noexcept;
// Randomly permutes `container` in place. Container must provide size() and operator[].
// Each 64-bit random number is split into two 32-bit halves; each half is scaled by the current
// remaining size to pick an index, which is swapped with container[--i] via an unqualified swap.
616 template <typename Container>
617 void shuffle(Container& container) noexcept;
// Rotates x left by k bits: (x << k) | (x >> (64 - k)).
// k must be in 1..63; k == 0 would shift by 64 bits, which is undefined. The generator calls it with 27.
628 static constexpr uint64_t
rotl(uint64_t x,
unsigned k) noexcept;
// Named run: the out-of-line definitions appear to call name(benchmarkName) and then
// run(std::forward<Op>(op)), so this is shorthand for bench.name(...).run(op).
// Returns *this (Bench&) for chaining.
680 template <typename Op>
682 Bench& run(
char const* benchmarkName, Op&& op);
// Same as above, taking the name as std::string.
684 template <typename Op>
686 Bench& run(
std::
string const& benchmarkName, Op&& op);
692 template <typename Op>
701 Bench& title(
char const* benchmarkTitle);
726 Bench& context(
char const* variableName,
char const* variableValue);
727 Bench& context(
std::
string const& variableName,
std::
string const& variableValue);
748 template <typename T>
749 Bench& batch(T b) noexcept;
773 Bench& timeUnit(
std::chrono::duration<
double> const& tu,
std::
string const& tuName);
775 ANKERL_NANOBENCH(NODISCARD)
std::chrono::duration<
double> const& timeUnit() const noexcept;
// Fluent setters; each returns Bench& so calls can be chained.
// NOTE(review): the effects described are how IterationLogic::Impl consumes the matching getters;
// the setter bodies themselves are presumably plain assignments - confirm.
//
// Target runtime per epoch = clockResolution() * multiple, before clamping.
807 Bench& clockResolutionMultiple(
size_t multiple) noexcept;
// Number of epochs to record before measuring ends.
825 Bench& epochs(
size_t numEpochs) noexcept;
// Upper clamp for the target runtime per epoch.
838 Bench& maxEpochTime(
std::chrono::nanoseconds t) noexcept;
// Lower clamp for the target runtime per epoch; applied after the upper clamp.
851 Bench& minEpochTime(
std::chrono::nanoseconds t) noexcept;
// Starting iteration count when upscaling, and floor for every automatically computed count.
864 Bench& minEpochIterations(uint64_t numIters) noexcept;
// Non-zero fixes the iterations per epoch and bypasses upscaling. setup(...).run(...) asserts that
// the current value is <= 1 and then forces it to 1.
873 Bench& epochIterations(uint64_t numIters) noexcept;
// Non-zero makes the first round a warmup round with this many iterations.
885 Bench& warmup(uint64_t numWarmupIters) noexcept;
// Enables the "relative" percentage column in the printed table.
905 Bench& relative(
bool isRelativeEnabled) noexcept;
936 template <typename Arg>
953 template <typename T>
954 Bench& complexityN(T n) noexcept;
1013 template <typename Op>
1014 BigO complexityBigO(
char const*
name, Op op) const;
1016 template <typename Op>
1017 BigO complexityBigO(
std::
string const&
name, Op op) const;
1037 template <typename SetupOp>
1038 detail::SetupRunner<SetupOp>
setup(SetupOp setupOp);
1041 template <typename SetupOp, typename Op>
1042 Bench& runImpl(SetupOp& setupOp, Op&& op);
1044 template <typename SetupOp>
1048 std::vector<Result> mResults{};
1058template <
typename Arg>
1063#if defined(_MSC_VER)
// MSVC-only sink: the MSVC doNotOptimizeAway() passes the address of its argument to this
// function. Defined in the implementation section.
1064void doNotOptimizeAwaySink(
void const*);
1066template <
typename T>
1074template <
typename T>
1077 asm volatile(
"" : :
"r,m"(val) :
"memory");
1080template <
typename T>
1082# if defined(__clang__)
1084 asm volatile(
"" :
"+r,m"(val) : :
"memory");
1087 asm volatile(
"" :
"+m,r"(val) : :
"memory");
1134#if ANKERL_NANOBENCH(PERF_COUNTERS)
1135 LinuxPerformanceCounters* mPc =
nullptr;
1151 template <
typename Op>
1153 for (
auto& rangeMeasure :
data) {
1154 rangeMeasure.first = op(rangeMeasure.first);
1161 template <
typename Op>
1163 :
BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1165 template <
typename Op>
1167 :
BigO(
std::move(bigOName), mapRangeMeasure(rangeMeasure, rangeToN)) {}
1179 double mNormalizedRootMeanSquare{};
1182std::ostream&
operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO>
const& bigOs);
1190namespace nanobench {
1197 return (std::numeric_limits<uint64_t>::max)();
1201uint64_t
Rng::operator()() noexcept {
1204 mX = UINT64_C(15241094284759029579) * mY;
1205 mY =
rotl(mY - x, 27);
1211uint32_t
Rng::bounded(uint32_t range) noexcept {
1212 uint64_t
const r32 =
static_cast<uint32_t
>(operator()());
1213 auto multiresult = r32 * range;
1214 return static_cast<uint32_t
>(multiresult >> 32U);
1218 auto i = (UINT64_C(0x3ff) << 52U) | (
operator()() >> 12U);
1222 std::memcpy(&d, &i,
sizeof(
double));
1226template <
typename Container>
1228 auto i = container.size();
1231 auto n = operator()();
1233 auto b1 =
static_cast<decltype(i)
>((
static_cast<uint32_t
>(n) *
static_cast<uint64_t
>(i)) >> 32U);
1234 swap(container[--i], container[b1]);
1236 auto b2 =
static_cast<decltype(i)
>(((n >> 32U) *
static_cast<uint64_t
>(i)) >> 32U);
1237 swap(container[--i], container[b2]);
1242constexpr uint64_t
Rng::
rotl(uint64_t x,
unsigned k) noexcept {
1243 return (x <<
k) | (x >> (64U -
k));
1248template <
typename SetupOp>
1252 : mSetupOp(
std::move(setupOp))
1255 template <
typename Op>
1258 assert((mBench.epochIterations() <= 1) &&
1259 "setup() runs once per epoch, not once per iteration; it requires epochIterations(1)");
1260 mBench.epochIterations(1);
1261 return mBench.runImpl(mSetupOp, std::forward<Op>(op));
1270template <
typename Op>
1273 auto setupOp = [] {};
1274 return runImpl(setupOp, std::forward<Op>(op));
1277template <
typename SetupOp,
typename Op>
1284 while (
auto n = iterationLogic.numIters()) {
1288 Clock::time_point
const before = Clock::now();
1292 Clock::time_point
const after = Clock::now();
1294 pc.updateResults(iterationLogic.numIters());
1295 iterationLogic.
add(after - before, pc);
1301template <
typename SetupOp>
1307template <
typename Op>
1309 name(benchmarkName);
1310 return run(std::forward<Op>(op));
1313template <
typename Op>
1315 name(benchmarkName);
1316 return run(std::forward<Op>(op));
1319template <
typename Op>
1324template <
typename Op>
1331template <
typename T>
1333 mConfig.mBatch =
static_cast<double>(b);
1338template <
typename T>
1340 mConfig.mComplexityN =
static_cast<double>(n);
1345template <
typename Arg>
1352template <
typename Arg>
1359#if defined(_MSC_VER)
1360template <
typename T>
1362 doNotOptimizeAwaySink(&val);
1371#if defined(ANKERL_NANOBENCH_IMPLEMENT)
1377# include <algorithm>
1382# include <functional>
1390# include <stdexcept>
1392# if defined(__linux__)
1395# if ANKERL_NANOBENCH(PERF_COUNTERS)
1398# include <linux/perf_event.h>
1399# include <sys/ioctl.h>
1400# include <sys/syscall.h>
1401# include <sys/types.h>
1412namespace nanobench {
1421class StreamStateRestorer;
1423class MarkDownColumn;
1436namespace nanobench {
1438uint64_t splitMix64(uint64_t& state)
noexcept;
1443template <
typename T>
1444inline double d(T t)
noexcept {
1445 return static_cast<double>(
t);
1447inline double d(Clock::duration duration)
noexcept {
1448 return std::chrono::duration_cast<std::chrono::duration<double>>(duration).
count();
1452inline Clock::duration clockResolution() noexcept;
1456namespace templates {
1458char const*
csv() noexcept {
1459 return R
"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
1460{{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
1465 return R
"DELIM(<html>
1468 <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
1472 <div id="myDiv"></div>
1477 y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}],
1481 var title = '{{title}}';
1483 data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
1484 var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
1491char const*
pyperf() noexcept {
1498{{#measurement}} {{elapsed}}{{^-last}},
1499{{/last}}{{/measurement}}
1506 "loops": {{sum(iterations)}},
1507 "inner_loops": {{batch}},
1508 "name": "{{title}}",
1515char const*
json() noexcept {
1519 "title": "{{title}}",
1523 "complexityN": {{complexityN}},
1524 "epochs": {{epochs}},
1525 "clockResolution": {{clockResolution}},
1526 "clockResolutionMultiple": {{clockResolutionMultiple}},
1527 "maxEpochTime": {{maxEpochTime}},
1528 "minEpochTime": {{minEpochTime}},
1529 "minEpochIterations": {{minEpochIterations}},
1530 "epochIterations": {{epochIterations}},
1531 "warmup": {{warmup}},
1532 "relative": {{relative}},
1533 "median(elapsed)": {{median(elapsed)}},
1534 "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
1535 "median(instructions)": {{median(instructions)}},
1536 "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
1537 "median(cpucycles)": {{median(cpucycles)}},
1538 "median(contextswitches)": {{median(contextswitches)}},
1539 "median(pagefaults)": {{median(pagefaults)}},
1540 "median(branchinstructions)": {{median(branchinstructions)}},
1541 "median(branchmisses)": {{median(branchmisses)}},
1542 "totalTime": {{sumProduct(iterations, elapsed)}},
1545 "iterations": {{iterations}},
1546 "elapsed": {{elapsed}},
1547 "pagefaults": {{pagefaults}},
1548 "cpucycles": {{cpucycles}},
1549 "contextswitches": {{contextswitches}},
1550 "instructions": {{instructions}},
1551 "branchinstructions": {{branchinstructions}},
1552 "branchmisses": {{branchmisses}}
1553 }{{^-last}},{{/-last}}
1555 }{{^-last}},{{/-last}}
// Kind of a parsed template node:
//   tag              - a {{...}} placeholder with no children
//   content          - literal text between tags, written out unchanged
//   section          - a block with child nodes
//   inverted_section - a block with child nodes whose condition is negated (see generateFirstLast)
// NOTE(review): sections presumably start with "{{#" and inverted sections with "{{^" - confirm in parseMustacheTemplate.
1562 enum class Type { tag, content, section, inverted_section };
1566 std::vector<Node> children;
1571 bool operator==(
char const (&str)[N])
const noexcept {
1573 return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
1579static std::vector<Node> parseMustacheTemplate(
char const** tpl) {
1580 std::vector<Node> nodes;
1583 auto const* begin = std::strstr(*tpl,
"{{");
1584 auto const* end = begin;
1585 if (begin !=
nullptr) {
1588 end = std::strstr(begin,
"}}");
1591 if (begin ==
nullptr || end ==
nullptr) {
1594 nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
1599 nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});
1611 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
1616 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
1620 nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
1626static bool generateFirstLast(Node
const& n,
size_t idx,
size_t size, std::ostream&
out) {
1628 bool const matchFirst = n ==
"-first";
1629 bool const matchLast = n ==
"-last";
1630 if (!matchFirst && !matchLast) {
1634 bool doWrite =
false;
1635 if (n.type == Node::Type::section) {
1636 doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
1637 }
else if (n.type == Node::Type::inverted_section) {
1638 doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
1642 for (
auto const& child : n.children) {
1643 if (child.type == Node::Type::content) {
1644 out.write(child.begin, std::distance(child.begin, child.end));
1651static bool matchCmdArgs(std::string
const& str, std::vector<std::string>& matchResult) {
1652 matchResult.clear();
1653 auto idxOpen = str.find(
'(');
1654 auto idxClose = str.find(
')', idxOpen);
1655 if (idxClose == std::string::npos) {
1659 matchResult.emplace_back(str.substr(0, idxOpen));
1662 matchResult.emplace_back();
1663 for (
size_t i = idxOpen + 1; i != idxClose; ++i) {
1664 if (str[i] ==
' ' || str[i] ==
'\t') {
1668 if (str[i] ==
',') {
1670 matchResult.emplace_back();
1674 matchResult.back() += str[i];
1679static bool generateConfigTag(Node
const& n, Config
const& config, std::ostream&
out) {
1683 out << config.mBenchmarkTitle;
1687 out << config.mBenchmarkName;
1691 out << config.mUnit;
1695 out << config.mBatch;
1698 if (n ==
"complexityN") {
1699 out << config.mComplexityN;
1702 if (n ==
"epochs") {
1703 out << config.mNumEpochs;
1706 if (n ==
"clockResolution") {
1707 out << d(detail::clockResolution());
1710 if (n ==
"clockResolutionMultiple") {
1711 out << config.mClockResolutionMultiple;
1714 if (n ==
"maxEpochTime") {
1715 out << d(config.mMaxEpochTime);
1718 if (n ==
"minEpochTime") {
1719 out << d(config.mMinEpochTime);
1722 if (n ==
"minEpochIterations") {
1723 out << config.mMinEpochIterations;
1726 if (n ==
"epochIterations") {
1727 out << config.mEpochIterations;
1730 if (n ==
"warmup") {
1731 out << config.mWarmup;
1734 if (n ==
"relative") {
1735 out << config.mIsRelative;
1742static std::ostream& generateResultTag(Node
const& n,
Result const& r, std::ostream&
out) {
1743 if (generateConfigTag(n, r.config(),
out)) {
1751 std::vector<std::string> matchResult;
1752 if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
1753 if (matchResult.size() == 2) {
1754 if (matchResult[0] ==
"context") {
1755 return out << r.context(matchResult[1]);
1763 if (matchResult[0] ==
"median") {
1764 return out << r.median(m);
1766 if (matchResult[0] ==
"average") {
1767 return out << r.average(m);
1769 if (matchResult[0] ==
"medianAbsolutePercentError") {
1770 return out << r.medianAbsolutePercentError(m);
1772 if (matchResult[0] ==
"sum") {
1773 return out << r.sum(m);
1775 if (matchResult[0] ==
"minimum") {
1776 return out << r.minimum(m);
1778 if (matchResult[0] ==
"maximum") {
1779 return out << r.maximum(m);
1781 }
else if (matchResult.size() == 3) {
1788 if (matchResult[0] ==
"sumProduct") {
1789 return out << r.sumProduct(m1, m2);
1798 throw std::runtime_error(
"command '" + std::string(n.begin, n.end) +
"' not understood");
1801static void generateResultMeasurement(std::vector<Node>
const& nodes,
size_t idx,
Result const& r, std::ostream&
out) {
1802 for (
auto const& n : nodes) {
1803 if (!generateFirstLast(n, idx, r.size(),
out)) {
1806 case Node::Type::content:
1807 out.write(n.begin, std::distance(n.begin, n.end));
1810 case Node::Type::inverted_section:
1811 throw std::runtime_error(
"got a inverted section inside measurement");
1813 case Node::Type::section:
1814 throw std::runtime_error(
"got a section inside measurement");
1816 case Node::Type::tag: {
1821 out << r.get(idx, m);
1830static void generateResult(std::vector<Node>
const& nodes,
size_t idx, std::vector<Result>
const& results, std::ostream&
out) {
1831 auto const& r = results[idx];
1832 for (
auto const& n : nodes) {
1833 if (!generateFirstLast(n, idx, results.size(),
out)) {
1836 case Node::Type::content:
1837 out.write(n.begin, std::distance(n.begin, n.end));
1840 case Node::Type::inverted_section:
1841 throw std::runtime_error(
"got a inverted section inside result");
1843 case Node::Type::section:
1844 if (n ==
"measurement") {
1845 for (
size_t i = 0; i < r.size(); ++i) {
1846 generateResultMeasurement(n.children, i, r,
out);
1849 throw std::runtime_error(
"got a section inside result");
1853 case Node::Type::tag:
1854 generateResultTag(n, r,
out);
// Internal helpers; definitions are in the implementation section.
//
// Returns std::getenv(name); MSVC warning 4996 is disabled around the call.
1866char const* getEnv(
char const*
name);
// True when the environment variable NANOBENCH_ENDLESS is set and equal to `name`.
1867bool isEndlessRunning(std::string
const&
name);
// True unless NANOBENCH_SUPPRESS_WARNINGS is set to something other than "0".
1868bool isWarningsEnabled();
// Opens `filename` with std::ifstream and reads a T from it. `fail` may be nullptr; when it is
// not, it receives a failure flag (presumably the stream's fail state - TODO confirm).
1870template <
typename T>
1871T parseFile(std::string
const& filename,
bool* fail);
// Fills `warnings` with reasons why measurements may be unstable (on Linux: differing
// scaling_min_freq/scaling_max_freq per CPU, a governor other than "performance", turbo enabled)
// and `recommendations` with matching advice. `recommendations` is cleared first.
1873void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
// Prints the stability warnings to *outStream at most once per process. The one-shot flag is only
// consumed when outStream is non-null and warnings are enabled.
1874void printStabilityInformationOnce(std::ostream* outStream);
// Reference to a function-local static holding the hash of the most recently printed table
// header; showResult() compares against it and overwrites it when the hash changes.
1877uint64_t& singletonHeaderHash() noexcept;
// Estimates the clock's resolution: repeats `numEvaluations` times, each time polling
// Clock::now() until the value changes, and returns the smallest difference observed.
1880Clock::duration calcClockResolution(
size_t numEvaluations) noexcept;
1887class NumSep :
public std::numpunct<char> {
1889 explicit NumSep(
char sep);
1890 char do_thousands_sep()
const override;
1891 std::string do_grouping()
const override;
1900class StreamStateRestorer {
1902 explicit StreamStateRestorer(std::ostream&
s);
1903 ~StreamStateRestorer();
1909 StreamStateRestorer(StreamStateRestorer
const&) =
delete;
1910 StreamStateRestorer& operator=(StreamStateRestorer
const&) =
delete;
1911 StreamStateRestorer(StreamStateRestorer&&) =
delete;
1912 StreamStateRestorer& operator=(StreamStateRestorer&&) =
delete;
1915 std::ostream& mStream;
1916 std::locale mLocale;
1917 std::streamsize
const mPrecision;
1918 std::streamsize
const mWidth;
1919 std::ostream::char_type
const mFill;
1920 std::ostream::fmtflags
const mFmtFlags;
1927 Number(
int width,
int precision,
double value);
1928 Number(
int width,
int precision, int64_t value);
1932 friend std::ostream&
operator<<(std::ostream& os, Number
const& n);
1933 std::ostream& write(std::ostream& os)
const;
1941std::string to_s(uint64_t n);
1943std::ostream&
operator<<(std::ostream& os, Number
const& n);
1945class MarkDownColumn {
1947 MarkDownColumn(
int w,
int prec, std::string tit, std::string suff,
double val)
noexcept;
1957 std::string mSuffix;
1964 explicit MarkDownCode(std::string
const& what);
1967 friend std::ostream&
operator<<(std::ostream& os, MarkDownCode
const& mdCode);
1968 std::ostream& write(std::ostream& os)
const;
1970 std::string mWhat{};
1973std::ostream&
operator<<(std::ostream& os, MarkDownCode
const& mdCode);
1983namespace nanobench {
1986void render(
char const* mustacheTemplate, std::vector<Result>
const& results, std::ostream&
out) {
1987 detail::fmt::StreamStateRestorer
const restorer(
out);
1989 out.precision(std::numeric_limits<double>::digits10);
1990 auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);
1992 for (
auto const& n : nodes) {
1995 case templates::Node::Type::content:
1996 out.write(n.begin, std::distance(n.begin, n.end));
1999 case templates::Node::Type::inverted_section:
2000 throw std::runtime_error(
"unknown list '" + std::string(n.begin, n.end) +
"'");
2002 case templates::Node::Type::section:
2003 if (n ==
"result") {
2004 const size_t nbResults = results.size();
2005 for (
size_t i = 0; i < nbResults; ++i) {
2006 generateResult(n.children, i, results,
out);
2008 }
else if (n ==
"measurement") {
2009 if (results.size() != 1) {
2010 throw std::runtime_error(
2011 "render: can only use section 'measurement' here if there is a single result, but there are " +
2012 detail::fmt::to_s(results.size()));
2015 auto const& r = results.front();
2016 for (
size_t i = 0; i < r.size(); ++i) {
2017 generateResultMeasurement(n.children, i, r,
out);
2020 throw std::runtime_error(
"render: unknown section '" + std::string(n.begin, n.end) +
"'");
2024 case templates::Node::Type::tag:
2025 if (results.size() == 1) {
2027 generateResultTag(n, results.front(),
out);
2030 if (!generateConfigTag(n, results.back().config(),
out)) {
2031 throw std::runtime_error(
"unknown tag '" + std::string(n.begin, n.end) +
"'");
2039void render(std::string
const& mustacheTemplate, std::vector<Result>
const& results, std::ostream&
out) {
2040 render(mustacheTemplate.c_str(), results,
out);
2043void render(
char const* mustacheTemplate,
const Bench& bench, std::ostream&
out) {
2044 render(mustacheTemplate, bench.results(),
out);
2047void render(std::string
const& mustacheTemplate,
const Bench& bench, std::ostream&
out) {
2048 render(mustacheTemplate.c_str(), bench.results(),
out);
2054# if defined(__clang__)
2055# pragma clang diagnostic push
2056# pragma clang diagnostic ignored "-Wexit-time-destructors"
2058 static PerformanceCounters pc;
2059# if defined(__clang__)
2060# pragma clang diagnostic pop
2069# if defined(_MSC_VER)
2070# pragma optimize("", off)
// Intentionally empty. It sits between #pragma optimize("", off) and #pragma optimize("", on),
// presumably so MSVC cannot inline or elide calls to it and the pointed-to value stays live.
2071void doNotOptimizeAwaySink(
void const*) {}
2072# pragma optimize("", on)
2075template <
typename T>
2076T parseFile(std::string
const& filename,
bool* fail) {
2077 std::ifstream fin(filename);
2080 if (fail !=
nullptr) {
2086char const* getEnv(
char const*
name) {
2087# if defined(_MSC_VER)
2088# pragma warning(push)
2089# pragma warning(disable : 4996)
2091 return std::getenv(
name);
2092# if defined(_MSC_VER)
2093# pragma warning(pop)
2097bool isEndlessRunning(std::string
const&
name) {
2098 auto const*
const endless = getEnv(
"NANOBENCH_ENDLESS");
2099 return nullptr != endless && endless ==
name;
2103bool isWarningsEnabled() {
2104 auto const*
const suppression = getEnv(
"NANOBENCH_SUPPRESS_WARNINGS");
2105 return nullptr == suppression || suppression == std::string(
"0");
2108void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
2110 recommendations.clear();
2113 warnings.emplace_back(
"DEBUG defined");
2114 bool const recommendCheckFlags =
true;
2116 bool const recommendCheckFlags =
false;
2119 bool recommendPyPerf =
false;
2120# if defined(__linux__)
2121 auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
2123 warnings.emplace_back(
"couldn't figure out number of processors - no governor, turbo check possible");
2126 for (
long id = 0;
id < nprocs; ++id) {
2127 auto idStr = detail::fmt::to_s(
static_cast<uint64_t
>(
id));
2128 auto sysCpu =
"/sys/devices/system/cpu/cpu" + idStr;
2129 auto minFreq = parseFile<int64_t>(sysCpu +
"/cpufreq/scaling_min_freq",
nullptr);
2130 auto maxFreq = parseFile<int64_t>(sysCpu +
"/cpufreq/scaling_max_freq",
nullptr);
2131 if (minFreq != maxFreq) {
2132 auto minMHz = d(minFreq) / 1000.0;
2133 auto maxMHz = d(maxFreq) / 1000.0;
2134 warnings.emplace_back(
"CPU frequency scaling enabled: CPU " + idStr +
" between " +
2135 detail::fmt::Number(1, 1, minMHz).to_s() +
" and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
2137 recommendPyPerf =
true;
2143 auto currentGovernor = parseFile<std::string>(
"/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor", &fail);
2144 if (!fail &&
"performance" != currentGovernor) {
2145 warnings.emplace_back(
"CPU governor is '" + currentGovernor +
"' but should be 'performance'");
2146 recommendPyPerf =
true;
2149 auto noTurbo = parseFile<int>(
"/sys/devices/system/cpu/intel_pstate/no_turbo", &fail);
2150 if (!fail && noTurbo == 0) {
2151 warnings.emplace_back(
"Turbo is enabled, CPU frequency will fluctuate");
2152 recommendPyPerf =
true;
2157 if (recommendCheckFlags) {
2158 recommendations.emplace_back(
"Make sure you compile for Release");
2160 if (recommendPyPerf) {
2161 recommendations.emplace_back(
"Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");
2165void printStabilityInformationOnce(std::ostream* outStream) {
2166 static bool shouldPrint =
true;
2167 if (shouldPrint && (
nullptr != outStream) && isWarningsEnabled()) {
2168 auto& os = *outStream;
2169 shouldPrint =
false;
2170 std::vector<std::string> warnings;
2171 std::vector<std::string> recommendations;
2172 gatherStabilityInformation(warnings, recommendations);
2173 if (warnings.empty()) {
2177 os <<
"Warning, results might be unstable:" << std::endl;
2178 for (
auto const& w : warnings) {
2179 os <<
"* " << w << std::endl;
2182 os << std::endl <<
"Recommendations" << std::endl;
2183 for (
auto const& r : recommendations) {
2184 os <<
"* " << r << std::endl;
2190uint64_t& singletonHeaderHash() noexcept {
2191 static uint64_t sHeaderHash{};
2196inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
2197 return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U));
2201Clock::duration calcClockResolution(
size_t numEvaluations)
noexcept {
2202 auto bestDuration = Clock::duration::max();
2203 Clock::time_point tBegin;
2204 Clock::time_point tEnd;
2205 for (
size_t i = 0; i < numEvaluations; ++i) {
2206 tBegin = Clock::now();
2208 tEnd = Clock::now();
2209 }
while (tBegin == tEnd);
2210 bestDuration = (std::min)(bestDuration, tEnd - tBegin);
2212 return bestDuration;
2216Clock::duration clockResolution() noexcept {
2217 static Clock::duration
const sResolution = calcClockResolution(20);
2222struct IterationLogic::Impl {
// Phase of the measurement state machine. The constructor picks the initial state:
//   endless           - NANOBENCH_ENDLESS matches the benchmark name; iteration count is UINT64 max
//   warmup            - warmup() is non-zero
//   measuring         - epochIterations() is non-zero (fixed iteration count, no upscaling)
//   upscaling_runtime - otherwise; starts from minEpochIterations()
2223 enum class State { warmup, upscaling_runtime, measuring, endless };
2225 explicit Impl(Bench
const& bench)
2227 , mResult(bench.config()) {
2228 printStabilityInformationOnce(mBench.output());
2231 mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
2232 if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
2233 mTargetRuntimePerEpoch = mBench.maxEpochTime();
2235 if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
2236 mTargetRuntimePerEpoch = mBench.minEpochTime();
2239 if (isEndlessRunning(mBench.name())) {
2240 std::cerr <<
"NANOBENCH_ENDLESS set: running '" << mBench.name() <<
"' endlessly" << std::endl;
2241 mNumIters = (std::numeric_limits<uint64_t>::max)();
2242 mState = State::endless;
2243 }
else if (0 != mBench.warmup()) {
2244 mNumIters = mBench.warmup();
2245 mState = State::warmup;
2246 }
else if (0 != mBench.epochIterations()) {
2248 mNumIters = mBench.epochIterations();
2249 mState = State::measuring;
2251 mNumIters = mBench.minEpochIterations();
2252 mState = State::upscaling_runtime;
2257 ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters)
noexcept {
2258 auto doubleElapsed = d(elapsed);
2259 auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
2260 auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);
2262 auto doubleMinEpochIters = d(mBench.minEpochIterations());
2263 if (doubleNewIters < doubleMinEpochIters) {
2264 doubleNewIters = doubleMinEpochIters;
2266 doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();
2270 return static_cast<uint64_t
>(doubleNewIters + 0.5);
2274 if (elapsed * 10 < mTargetRuntimePerEpoch) {
2276 if (mNumIters * 10 < mNumIters) {
2278 showResult(
"iterations overflow. Maybe your code got optimized away?");
2284 mNumIters = calcBestNumIters(elapsed, mNumIters);
2288 void add(std::chrono::nanoseconds elapsed, PerformanceCounters
const& pc)
noexcept {
2289# if defined(ANKERL_NANOBENCH_LOG_ENABLED)
2290 auto oldIters = mNumIters;
2295 if (isCloseEnoughForMeasurements(elapsed)) {
2298 mState = State::measuring;
2299 mNumIters = calcBestNumIters(elapsed, mNumIters);
2302 mState = State::upscaling_runtime;
2307 case State::upscaling_runtime:
2308 if (isCloseEnoughForMeasurements(elapsed)) {
2310 mState = State::measuring;
2311 mTotalElapsed += elapsed;
2312 mTotalNumIters += mNumIters;
2313 mResult.add(elapsed, mNumIters, pc);
2314 mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2320 case State::measuring:
2323 mTotalElapsed += elapsed;
2324 mTotalNumIters += mNumIters;
2325 mResult.add(elapsed, mNumIters, pc);
2326 if (0 != mBench.epochIterations()) {
2327 mNumIters = mBench.epochIterations();
2329 mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2333 case State::endless:
2334 mNumIters = (std::numeric_limits<uint64_t>::max)();
2338 if (
static_cast<uint64_t
>(mResult.size()) == mBench.epochs()) {
2344 ANKERL_NANOBENCH_LOG(mBench.name() <<
": " << detail::fmt::Number(20, 3, d(elapsed.count())) <<
" elapsed, "
2345 << detail::fmt::Number(20, 3, d(mTargetRuntimePerEpoch.count())) <<
" target. oldIters="
2346 << oldIters <<
", mNumIters=" << mNumIters <<
", mState=" <<
static_cast<int>(mState));
// Writes the markdown table output for the current measurement to
// mBench.output(). The visible work is guarded by a check that the output
// stream pointer is not null. A non-empty errorMessage selects the error row.
// NOTE(review): this listing omits several source lines (the embedded line
// numbers skip), including the definitions of rMedian, rErrorMedian,
// rBraMedian, d, p and hash, so the block is documented as-is.
2350 void showResult(std::string
const& errorMessage)
const {
2353 if (mBench.output() !=
nullptr) {
// One MarkDownColumn per metric that will be printed.
2355 std::vector<fmt::MarkDownColumn> columns;
// "relative" column, only in relative mode.
2359 if (mBench.relative()) {
2361 if (!mBench.results().empty()) {
2364 columns.emplace_back(11, 1,
"relative",
"%", d);
// "complexityN" column, only when a positive complexityN is configured.
2367 if (mBench.complexityN() > 0) {
2368 columns.emplace_back(14, 0,
"complexityN",
"", mBench.complexityN());
// Time per unit, scaled by the configured time unit and batch size.
2371 columns.emplace_back(22, 2, mBench.timeUnitName() +
"/" + mBench.unit(),
"",
2372 rMedian / (mBench.timeUnit().count() * mBench.batch()));
// Units per second; reported as 0 when rMedian is not positive.
2373 columns.emplace_back(22, 2, mBench.unit() +
"/s",
"", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);
2376 columns.emplace_back(10, 1,
"err%",
"%", rErrorMedian * 100.0);
// Instruction and cycle medians start at -1.0.
// NOTE(review): the lines that assign measured values are not visible here.
2378 double rInsMedian = -1.0;
2381 columns.emplace_back(18, 2,
"ins/" + mBench.unit(),
"", rInsMedian / mBench.batch());
2384 double rCycMedian = -1.0;
2387 columns.emplace_back(18, 2,
"cyc/" + mBench.unit(),
"", rCycMedian / mBench.batch());
// IPC is only shown when both medians are positive.
2389 if (rInsMedian > 0.0 && rCycMedian > 0.0) {
2390 columns.emplace_back(9, 3,
"IPC",
"", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
2394 columns.emplace_back(17, 2,
"bra/" + mBench.unit(),
"", rBraMedian / mBench.batch());
// Branch miss percentage is only shown for a branch median of at least 1e-9.
2397 if (rBraMedian >= 1e-9) {
2400 columns.emplace_back(10, 1,
"miss%",
"%", p);
2407 auto& os = *mBench.output();
// Combine every setting that affects the table header into one hash.
2411 hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);
2412 hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);
2413 hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);
2414 hash = hash_combine(std::hash<double>{}(mBench.timeUnit().
count()), hash);
2415 hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);
2416 hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);
// A changed hash is stored, then the title row and separator row are written.
2418 if (hash != singletonHeaderHash()) {
2419 singletonHeaderHash() = hash;
2423 for (
auto const& col : columns) {
2426 os <<
"| " << mBench.title() << std::endl;
2428 for (
auto const& col : columns) {
2429 os << col.separator();
2431 os <<
"|:" << std::string(mBench.title().size() + 1U,
'-') << std::endl;
// Error row: every column prints its invalid() cell, followed by the
// benchmark name and the error message.
2434 if (!errorMessage.empty()) {
2435 for (
auto const& col : columns) {
2436 os << col.invalid();
2438 os <<
"| :boom: " << fmt::MarkDownCode(mBench.name()) <<
" (" << errorMessage <<
')' << std::endl;
2440 for (
auto const& col : columns) {
// Unstable means warnings are enabled and the error median is at least 5%.
2444 auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05;
2446 os <<
":wavy_dash: ";
2448 os << fmt::MarkDownCode(mBench.name());
// Average iterations per epoch; the suggestion is ten times that, rounded.
2450 auto avgIters = d(mTotalNumIters) / d(mBench.epochs());
2452 auto suggestedIters =
static_cast<uint64_t
>(avgIters * 10 + 0.5);
2454 os <<
" (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
2455 <<
" iters. Increase `minEpochIterations` to e.g. " << suggestedIters <<
")";
2462 ANKERL_NANOBENCH(NODISCARD)
bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed)
const noexcept {
2463 return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
// Data members of the iteration logic implementation.
// NOTE(review): listing lines 2469-2470 are not visible here.
// Iteration count, initialised to 1.
2466 uint64_t mNumIters = 1;
// Benchmark whose settings this logic reads; held by const reference.
2467 Bench
const& mBench;
// Target runtime for one epoch; value-initialised to zero.
2468 std::chrono::nanoseconds mTargetRuntimePerEpoch{};
// Running totals of elapsed time and iterations.
2471 std::chrono::nanoseconds mTotalElapsed{};
2472 uint64_t mTotalNumIters = 0;
// The state machine starts in State::upscaling_runtime.
2473 State mState = State::upscaling_runtime;
2477IterationLogic::IterationLogic(Bench
const& bench)
2478 : mPimpl(new Impl(bench)) {}
2480IterationLogic::~IterationLogic() {
// Returns the iteration count currently stored in the implementation object.
// NOTE(review): listing line 2485 is not visible here.
2484uint64_t IterationLogic::numIters() const noexcept {
2486 return mPimpl->mNumIters;
2489void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters
const& pc)
noexcept {
2490 mPimpl->add(elapsed, pc);
2493void IterationLogic::moveResultTo(std::vector<Result>& results)
noexcept {
2494 results.emplace_back(std::move(mPimpl->mResult));
2497# if ANKERL_NANOBENCH(PERF_COUNTERS)
// Reads a group of Linux perf events through the file descriptor mFd, using
// ioctl(PERF_EVENT_IOC_*) and read(), and stores the values in mCounters.
// NOTE(review): many lines of this class are missing from this listing (the
// embedded line numbers skip), so it is documented as-is.
2500class LinuxPerformanceCounters {
// Target: where a counter value is written, plus which corrections apply.
2503 Target(uint64_t* targetValue_,
bool correctMeasuringOverhead_,
bool correctLoopOverhead_)
2504 : targetValue(targetValue_)
2505 , correctMeasuringOverhead(correctMeasuringOverhead_)
2506 , correctLoopOverhead(correctLoopOverhead_) {}
2508 uint64_t* targetValue{};
2509 bool correctMeasuringOverhead{};
2510 bool correctLoopOverhead{};
// Default constructible; copying and moving are disabled.
2513 LinuxPerformanceCounters() =
default;
2514 LinuxPerformanceCounters(LinuxPerformanceCounters
const&) =
delete;
2515 LinuxPerformanceCounters(LinuxPerformanceCounters&&) =
delete;
2516 LinuxPerformanceCounters& operator=(LinuxPerformanceCounters
const&) =
delete;
2517 LinuxPerformanceCounters& operator=(LinuxPerformanceCounters&&) =
delete;
2518 ~LinuxPerformanceCounters();
// start() and stop() have empty bodies.
2521 inline void start() {}
2523 inline void stop() {}
// Register a software or hardware event whose value is reported to `target`.
2525 bool monitor(perf_sw_ids swId, Target target);
2526 bool monitor(perf_hw_id hwId, Target target);
// Resets and then enables the whole event group. mHasError is set when an
// ioctl returns -1.
2534 inline void beginMeasure() {
2540 mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
2546 mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
// Disables the group and reads all counters into mCounters. A read that does
// not return exactly numBytes is recorded as an error.
2549 inline void endMeasure() {
2555 mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
2560 auto const numBytes =
sizeof(uint64_t) * mCounters.size();
2561 auto ret = read(mFd, mCounters.data(), numBytes);
2562 mHasError =
ret !=
static_cast<ssize_t
>(numBytes);
2565 void updateResults(uint64_t numIters);
// Division that adds half the divisor before dividing.
2568 template <
typename T>
2569 static inline T divRounded(T a, T divisor) {
2570 return (a + divisor / 2) / divisor;
2574 static inline uint32_t mix(uint32_t x) noexcept {
// Calibrates the measuring overhead and the per-iteration loop overhead.
// NOTE(review): the calls to `op` and the measured loops are not visible.
2581 template <
typename Op>
2583 void calibrate(Op&& op) {
2585 for (
auto& v : mCalibratedOverhead) {
// Start every entry at the maximum, then keep the smallest counter value
// seen over 100 rounds.
2590 auto newCalibration = mCalibratedOverhead;
2591 for (
auto& v : newCalibration) {
2592 v = (std::numeric_limits<uint64_t>::max)();
2594 for (
size_t iter = 0; iter < 100; ++iter) {
2602 for (
size_t i = 0; i < newCalibration.size(); ++i) {
2603 auto diff = mCounters[i];
2604 if (newCalibration[i] > diff) {
2605 newCalibration[i] = diff;
2610 mCalibratedOverhead = std::move(newCalibration);
// The iteration count takes its two lowest bits from std::random_device.
2617 uint64_t
const numIters = 100000U + (std::random_device{}() & 3U);
2618 uint64_t n = numIters;
2619 uint32_t x = 1234567;
2627 auto measure1 = mCounters;
2638 auto measure2 = mCounters;
// Loop overhead per counter: both measurements are reduced by the calibrated
// overhead (clamped at 0), then 2 * m1 - m2 (clamped at 0) is divided by
// numIters with rounding.
2640 for (
size_t i = 0; i < mCounters.size(); ++i) {
2642 auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
2643 auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
2644 auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;
2646 mLoopOverhead[i] = divRounded(overhead, numIters);
2652 bool monitor(uint32_t type, uint64_t eventid, Target target);
// Maps a perf event id to the target that receives its value.
2654 std::map<uint64_t, Target> mIdToTarget{};
// NOTE(review): brace-initialising a std::vector<uint64_t> with {3} selects
// the initializer_list constructor, giving ONE element with value 3, not
// three elements. monitor(uint32_t, uint64_t, Target) later resizes all three
// vectors to 3 + 2 * mIdToTarget.size(). Confirm the initial size is intended.
2657 std::vector<uint64_t> mCounters{3};
2658 std::vector<uint64_t> mCalibratedOverhead{3};
2659 std::vector<uint64_t> mLoopOverhead{3};
2661 uint64_t mTimeEnabledNanos = 0;
2662 uint64_t mTimeRunningNanos = 0;
// Set by the measuring functions when an ioctl or read fails.
2664 bool mHasError =
false;
2668LinuxPerformanceCounters::~LinuxPerformanceCounters() {
2674bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
2675 return monitor(PERF_TYPE_SOFTWARE, swId, target);
2678bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
2679 return monitor(PERF_TYPE_HARDWARE, hwId, target);
// Copies the most recently read counter values into their registered targets,
// applying the overhead corrections each target asked for.
// As indexed below, mCounters holds: [0] the number of entries, [1] time
// enabled, [2] time running, then (value, id) pairs starting at index 3.
// NOTE(review): listing lines 2688-2693 and the else/closing-brace lines are
// not visible here.
2684void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
// Reset every target value to 0 first.
2686 for (
auto& id_value : mIdToTarget) {
2687 *id_value.second.targetValue = UINT64_C(0);
2694 mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
2695 mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];
2697 for (uint64_t i = 0; i < mCounters[0]; ++i) {
// idx is the value slot of pair i; the event id is in the next slot.
2698 auto idx =
static_cast<size_t>(3 + i * 2 + 0);
2699 auto id = mCounters[idx + 1U];
2701 auto it = mIdToTarget.find(
id);
2702 if (it != mIdToTarget.end()) {
2704 auto& tgt = it->second;
2705 *tgt.targetValue = mCounters[idx];
// Subtract the calibrated measuring overhead, clamping at 0.
2706 if (tgt.correctMeasuringOverhead) {
2707 if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
2708 *tgt.targetValue -= mCalibratedOverhead[idx];
2710 *tgt.targetValue = 0U;
// Subtract the per-iteration loop overhead times numIters, clamping at 0.
2713 if (tgt.correctLoopOverhead) {
2714 auto correctionVal = mLoopOverhead[idx] * numIters;
2715 if (*tgt.targetValue >= correctionVal) {
2716 *tgt.targetValue -= correctionVal;
2718 *tgt.targetValue = 0U;
// Opens one perf event of the given type and event id through the
// perf_event_open syscall and registers `target` under the event's id.
// NOTE(review): several lines are not visible here (the embedded numbers
// skip), including the declarations of pid, cpu and id, the error handling
// and the return statements.
2725bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
// The target starts at the maximum uint64_t value.
2726 *target.targetValue = (std::numeric_limits<uint64_t>::max)();
// Zero-initialise the attribute struct before filling it in.
2731 auto pea = perf_event_attr();
2732 std::memset(&pea, 0,
sizeof(perf_event_attr));
2734 pea.size =
sizeof(perf_event_attr);
2735 pea.config = eventid;
2737 pea.exclude_kernel = 1;
// Request group reads that carry event ids and the enabled/running times.
2741 pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
// Use PERF_FLAG_FD_CLOEXEC when the headers define it, otherwise no flags.
2745# if defined(PERF_FLAG_FD_CLOEXEC)
2746 const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
2748 const unsigned long flags = 0;
// mFd is passed as the group file descriptor argument.
2752 auto fd =
static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd,
flags));
// Ask the kernel for the event's id.
2762 if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &
id)) {
2768 mIdToTarget.emplace(
id, target);
// Three leading slots plus two slots per registered event.
2771 auto size = 3 + 2 * mIdToTarget.size();
2772 mCounters.resize(size);
2773 mCalibratedOverhead.resize(size);
2774 mLoopOverhead.resize(size);
// Allocates a LinuxPerformanceCounters object and tries to register each
// supported counter. mHas records, per counter, whether monitor() succeeded.
// NOTE(review): several lines are not visible here (the embedded numbers
// skip), including the rest of the initializer list and the calibration.
2779PerformanceCounters::PerformanceCounters()
2780 : mPc(new LinuxPerformanceCounters())
// Try the reference cycle counter first, then fall back to the plain cycle
// counter when that fails.
2785 mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles,
true,
false));
2786 if (!mHas.cpuCycles) {
2788 mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles,
true,
false));
// Instructions is the only counter that also requests loop overhead correction.
2790 mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions,
true,
true));
2791 mHas.branchInstructions =
2792 mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions,
true,
false));
2793 mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses,
true,
false));
// Software counters.
2797 mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults,
true,
false));
2798 mHas.contextSwitches =
2799 mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches,
true,
false));
2803 auto before = ankerl::nanobench::Clock::now();
2804 auto after = ankerl::nanobench::Clock::now();
// On error, mark every counter as unavailable.
2809 if (mPc->hasError()) {
2811 mHas = PerfCountSet<bool>{};
2815PerformanceCounters::~PerformanceCounters() {
2820void PerformanceCounters::beginMeasure() {
2821 mPc->beginMeasure();
2824void PerformanceCounters::endMeasure() {
2828void PerformanceCounters::updateResults(uint64_t numIters) {
2829 mPc->updateResults(numIters);
2834PerformanceCounters::PerformanceCounters() =
default;
2835PerformanceCounters::~PerformanceCounters() =
default;
2836void PerformanceCounters::beginMeasure() {}
2837void PerformanceCounters::endMeasure() {}
2838void PerformanceCounters::updateResults(uint64_t) {}
2842ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t>
const& PerformanceCounters::val() const noexcept {
2845ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool>
const& PerformanceCounters::has() const noexcept {
2853NumSep::NumSep(
char sep)
2856char NumSep::do_thousands_sep()
const {
2860std::string NumSep::do_grouping()
const {
// Captures formatting state of stream `s`: its locale, precision and format
// flags are stored in the members shown below.
// NOTE(review): listing lines 2866, 2869 and 2870 are not visible here.
2865StreamStateRestorer::StreamStateRestorer(std::ostream&
s)
2867 , mLocale(
s.getloc())
2868 , mPrecision(
s.precision())
2871 , mFmtFlags(
s.
flags()) {}
2873StreamStateRestorer::~StreamStateRestorer() {
2878void StreamStateRestorer::restore() {
2879 mStream.imbue(mLocale);
2880 mStream.precision(mPrecision);
2881 mStream.width(mWidth);
2882 mStream.fill(mFill);
2883 mStream.flags(mFmtFlags);
// Builds a Number from an integer value, which is converted with d() before
// it is stored in mValue.
// NOTE(review): listing line 2887 is not visible here.
2886Number::Number(
int width,
int precision, int64_t value)
2888 , mPrecision(precision)
2889 , mValue(d(value)) {}
// Builds a Number from a floating point value.
// NOTE(review): listing lines 2892 and 2894 are not visible here.
2891Number::Number(
int width,
int precision,
double value)
2893 , mPrecision(precision)
// Writes the value to `os` in fixed notation with the stored width and
// precision, using a locale that carries a NumSep(',') facet.
// NOTE(review): the return statement is not visible in this listing.
2896std::ostream& Number::write(std::ostream& os)
const {
// NOTE(review): presumably puts the stream state back when it goes out of
// scope. The destructor body is not visible here. Confirm.
2897 StreamStateRestorer
const restorer(os);
2898 os.imbue(std::locale(os.getloc(),
new NumSep(
',')));
2899 os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
// Returns a string form of the number, built with a std::stringstream.
// NOTE(review): the lines that fill and return the stream are not visible.
2903std::string Number::to_s()
const {
2904 std::stringstream ss;
// Converts `n` to decimal text: digits are appended least significant first
// (n % 10) and the string is reversed afterwards.
// NOTE(review): the loop header, the declaration of `str` and the return
// statement are not visible in this listing.
2909std::string to_s(uint64_t n) {
2912 str +=
static_cast<char>(
'0' +
static_cast<char>(n % 10));
2915 std::reverse(str.begin(), str.end());
2919std::ostream&
operator<<(std::ostream& os, Number
const& n) {
// Builds a column description. The title and suffix strings are taken by
// value and moved into the members.
// NOTE(review): the initialisers on listing lines 2924, 2925 and 2928 are
// not visible here.
2923MarkDownColumn::MarkDownColumn(
int w,
int prec, std::string tit, std::string suff,
double val) noexcept
2926 , mTitle(std::move(tit))
2927 , mSuffix(std::move(suff))
// Formats the header cell: '|' followed by the title right-aligned in
// mWidth - 2 characters and one trailing space.
// NOTE(review): the return statement is not visible in this listing.
2930std::string MarkDownColumn::title()
const {
2931 std::stringstream ss;
2932 ss <<
'|' << std::setw(mWidth - 2) << std::right << mTitle <<
' ';
// Builds the separator cell, starting from mWidth '-' characters.
// NOTE(review): the lines that adjust and return `sep` are not visible.
2936std::string MarkDownColumn::separator()
const {
2937 std::string sep(
static_cast<size_t>(mWidth),
'-');
// Builds the cell shown when no value is available: mWidth spaces with a '-'
// in the second-to-last position.
// NOTE(review): listing line 2945 and the return statement are not visible.
2943std::string MarkDownColumn::invalid()
const {
2944 std::string sep(
static_cast<size_t>(mWidth),
' ');
2946 sep[sep.size() - 2] =
'-';
// Formats the value cell: '|', the number, the suffix and one trailing space.
// The number's width is reduced by 2 and by the suffix length.
// NOTE(review): the return statement is not visible in this listing.
2950std::string MarkDownColumn::value()
const {
2951 std::stringstream ss;
2952 auto width = mWidth - 2 -
static_cast<int>(mSuffix.size());
2953 ss <<
'|' << Number(width, mPrecision, mValue) << mSuffix <<
' ';
// Wraps `what` in backticks for use as markdown inline code. Space is
// reserved for the text plus the two delimiters.
// NOTE(review): the condition around the backtick pushed inside the loop and
// the line that appends `c` are not visible in this listing.
2958MarkDownCode::MarkDownCode(std::string
const& what) {
2959 mWhat.reserve(what.size() + 2);
2960 mWhat.push_back(
'`');
2961 for (
char const c : what) {
2964 mWhat.push_back(
'`');
2967 mWhat.push_back(
'`');
2970std::ostream& MarkDownCode::write(std::ostream& os)
const {
2974std::ostream&
operator<<(std::ostream& os, MarkDownCode
const& mdCode) {
2975 return mdCode.write(os);
2981Config::Config() =
default;
2982Config::~Config() =
default;
2983Config& Config::operator=(Config
const&) =
default;
2984Config& Config::operator=(Config&&) noexcept(
ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
2985Config::Config(Config const&) = default;
2986Config::Config(Config&&) noexcept = default;
// Converts an enumeration value to its underlying integer type, so it can be
// used for example as an array index.
template <typename T>
inline constexpr typename std::underlying_type<T>::type u(T val) noexcept {
    return static_cast<typename std::underlying_type<T>::type>(val);
}
3004 : mConfig(
std::move(benchmarkConfig))
3005 , mNameToMeasurements{
detail::u(
Result::Measure::_size)} {}
// Records one measurement. Every stored value except the iteration count is
// divided by the number of iterations, and a performance counter value is
// only stored when pc.has() reports that counter.
// NOTE(review): several lines are not visible here (the embedded numbers
// skip), including the declaration of the outer branchInstructions used in
// the branch miss handling.
3007void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters
const& pc) {
3011 double const dIters = d(iters);
3012 mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);
3014 mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
3015 if (pc.has().pageFaults) {
3016 mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
3018 if (pc.has().cpuCycles) {
3019 mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
3021 if (pc.has().contextSwitches) {
3022 mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
3024 if (pc.has().instructions) {
3025 mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
// Branch instructions are reduced by iters + 1 before they are stored, and
// stay 0.0 when the counter does not exceed that amount.
3027 if (pc.has().branchInstructions) {
3028 double branchInstructions = 0.0;
3030 if (pc.val().branchInstructions > iters + 1U) {
3031 branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
3033 mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);
// Branch misses are capped at the branch instruction count and reduced by 1.
3035 if (pc.has().branchMisses) {
3037 double branchMisses = d(pc.val().branchMisses);
3038 if (branchMisses > branchInstructions) {
3040 branchMisses = branchInstructions;
3044 branchMisses -= 1.0;
3045 if (branchMisses < 1.0) {
3048 mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
3053Config
const& Result::config() const noexcept {
// Returns the median of `data`. The vector is sorted in place, so the
// caller's data is reordered. An odd size returns the middle element; an even
// size returns the mean of the two middle elements.
// NOTE(review): listing lines 3058-3060 are not visible here.
3057inline double calcMedian(std::vector<double>&
data) {
3061 std::sort(
data.begin(),
data.end());
3063 auto midIdx =
data.size() / 2U;
3064 if (1U == (
data.size() & 1U)) {
3065 return data[midIdx];
3067 return (
data[midIdx - 1U] +
data[midIdx]) / 2U;
// Returns the median of the measurements stored for `m`. The data is copied
// first because calcMedian sorts its argument.
// NOTE(review): listing line 3071 is not visible here.
3070double Result::median(Measure m)
const {
3072 auto data = mNameToMeasurements[detail::u(m)];
3073 return calcMedian(
data);
// Returns the arithmetic mean of the measurements stored for `m`: sum(m)
// divided by the number of values.
// NOTE(review): listing lines 3079-3083 are not visible here.
3076double Result::average(Measure m)
const {
3078 auto const&
data = mNameToMeasurements[detail::u(m)];
3084 return sum(m) / d(
data.size());
// Works on a copy of the measurements for `m`: takes their median, updates
// every element in the loop, then returns the median of the updated values.
// NOTE(review): the loop body that transforms each element is not visible in
// this listing.
3087double Result::medianAbsolutePercentError(Measure m)
const {
3089 auto data = mNameToMeasurements[detail::u(m)];
3093 auto med = calcMedian(
data);
3096 for (
auto& x :
data) {
3102 return calcMedian(
data);
3106 auto const&
data = mNameToMeasurements[detail::u(m)];
3107 return std::accumulate(
data.begin(),
data.end(), 0.0);
// Sums the element-wise products of the measurements stored for m1 and m2.
// NOTE(review): the statement executed when the two sizes differ and the
// final return statement are not visible in this listing.
3110double Result::sumProduct(Measure m1, Measure m2)
const noexcept {
3111 auto const& data1 = mNameToMeasurements[detail::u(m1)];
3112 auto const& data2 = mNameToMeasurements[detail::u(m2)];
3114 if (data1.size() != data2.size()) {
3118 double result = 0.0;
3119 for (
size_t i = 0,
s = data1.size(); i !=
s; ++i) {
3120 result += data1[i] * data2[i];
3125bool Result::has(Measure m)
const noexcept {
3126 return !mNameToMeasurements[detail::u(m)].empty();
3129double Result::get(
size_t idx, Measure m)
const {
3130 auto const&
data = mNameToMeasurements[detail::u(m)];
3131 return data.at(idx);
3134bool Result::empty() const noexcept {
3135 return 0U == size();
// Reports a size based on the values recorded for Measure::elapsed.
// NOTE(review): the return statement is not visible in this listing.
3138size_t Result::size() const noexcept {
3139 auto const&
data = mNameToMeasurements[detail::u(Measure::elapsed)];
// Returns the smallest value recorded for measure `m`.
// NOTE(review): listing lines 3145-3149 are not visible here.
3143double Result::minimum(Measure m)
const noexcept {
3144 auto const&
data = mNameToMeasurements[detail::u(m)];
3150 return *std::min_element(
data.begin(),
data.end());
// Returns the largest value recorded for measure `m`.
// NOTE(review): listing lines 3155-3159 are not visible here.
3153double Result::maximum(Measure m)
const noexcept {
3154 auto const&
data = mNameToMeasurements[detail::u(m)];
3160 return *std::max_element(
data.begin(),
data.end());
3163std::string
const& Result::context(
char const* variableName)
const {
3164 return mConfig.mContext.at(variableName);
3167std::string
const& Result::context(std::string
const& variableName)
const {
3168 return mConfig.mContext.at(variableName);
// Maps a measure name to its Measure enumerator. The comparison is an exact
// string match, and Measure::_size is returned when no name matches.
// NOTE(review): listing line 3196 is not visible here.
3171Result::Measure Result::fromString(std::string
const& str) {
3172 if (str ==
"elapsed") {
3173 return Measure::elapsed;
3175 if (str ==
"iterations") {
3176 return Measure::iterations;
3178 if (str ==
"pagefaults") {
3179 return Measure::pagefaults;
3181 if (str ==
"cpucycles") {
3182 return Measure::cpucycles;
3184 if (str ==
"contextswitches") {
3185 return Measure::contextswitches;
3187 if (str ==
"instructions") {
3188 return Measure::instructions;
3190 if (str ==
"branchinstructions") {
3191 return Measure::branchinstructions;
3193 if (str ==
"branchmisses") {
3194 return Measure::branchmisses;
3197 return Measure::_size;
3202 mConfig.mOut = &std::cout;
3205Bench::Bench(Bench&&) noexcept = default;
3206Bench& Bench::operator=(Bench&&) noexcept(
ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
3207Bench::Bench(Bench const&) = default;
3208Bench& Bench::operator=(Bench const&) = default;
3209Bench::~Bench() noexcept = default;
3211double Bench::batch() const noexcept {
3212 return mConfig.mBatch;
3215double Bench::complexityN() const noexcept {
3216 return mConfig.mComplexityN;
// Stores whether relative mode is enabled in the configuration.
// NOTE(review): this listing does not show the rest of the definition.
3221Bench& Bench::relative(
bool isRelativeEnabled)
noexcept {
3222 mConfig.mIsRelative = isRelativeEnabled;
3225bool Bench::relative() const noexcept {
3226 return mConfig.mIsRelative;
3230 mConfig.mShowPerformanceCounters = showPerformanceCounters;
3234 return mConfig.mShowPerformanceCounters;
// Sets the unit name. The visible condition checks whether `u` differs from
// the currently configured unit.
// NOTE(review): the statements inside and after the condition are not
// visible in this listing.
3240Bench& Bench::unit(
char const* u) {
3241 if (u != mConfig.mUnit) {
3248Bench& Bench::unit(std::string
const& u) {
3249 return unit(u.c_str());
3252std::string
const& Bench::unit() const noexcept {
3253 return mConfig.mUnit;
// Stores the time unit and its display name in the configuration.
// NOTE(review): this listing does not show the rest of the definition.
3256Bench& Bench::timeUnit(std::chrono::duration<double>
const& tu, std::string
const& tuName) {
3257 mConfig.mTimeUnit = tu;
3258 mConfig.mTimeUnitName = tuName;
3262std::string
const& Bench::timeUnitName() const noexcept {
3263 return mConfig.mTimeUnitName;
3266std::chrono::duration<double>
const& Bench::timeUnit() const noexcept {
3267 return mConfig.mTimeUnit;
// Stores a new benchmark title. The visible condition checks whether the
// title differs from the current one.
// NOTE(review): the statement inside the condition and the end of the
// definition are not visible in this listing.
3271Bench& Bench::title(
const char* benchmarkTitle) {
3272 if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3275 mConfig.mBenchmarkTitle = benchmarkTitle;
// Stores a new benchmark title. The visible condition checks whether the
// title differs from the current one.
// NOTE(review): the statement inside the condition and the end of the
// definition are not visible in this listing.
3278Bench& Bench::title(std::string
const& benchmarkTitle) {
3279 if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3282 mConfig.mBenchmarkTitle = benchmarkTitle;
3286std::string
const& Bench::title() const noexcept {
3287 return mConfig.mBenchmarkTitle;
3291 mConfig.mBenchmarkName = benchmarkName;
// Stores the benchmark name in the configuration.
// NOTE(review): this listing does not show the rest of the definition.
3295Bench&
Bench::name(std::string
const& benchmarkName) {
3296 mConfig.mBenchmarkName = benchmarkName;
3301 return mConfig.mBenchmarkName;
// Sets the context entry `variableName` to `variableValue`. operator[]
// creates the entry when it does not exist yet.
// NOTE(review): this listing does not show the rest of the definition.
3304Bench& Bench::context(
char const* variableName,
char const* variableValue) {
3305 mConfig.mContext[variableName] = variableValue;
// Sets the context entry `variableName` to `variableValue`. operator[]
// creates the entry when it does not exist yet.
// NOTE(review): this listing does not show the rest of the definition.
3309Bench& Bench::context(std::string
const& variableName, std::string
const& variableValue) {
3310 mConfig.mContext[variableName] = variableValue;
// Removes every entry from the configured context.
// NOTE(review): this listing does not show the rest of the definition.
3314Bench& Bench::clearContext() {
3315 mConfig.mContext.clear();
// Stores the number of epochs in the configuration.
// NOTE(review): this listing does not show the rest of the definition.
3320Bench& Bench::epochs(
size_t numEpochs)
noexcept {
3321 mConfig.mNumEpochs = numEpochs;
3324size_t Bench::epochs() const noexcept {
3325 return mConfig.mNumEpochs;
// Stores the clock resolution multiple in the configuration.
// NOTE(review): this listing does not show the rest of the definition.
3329Bench& Bench::clockResolutionMultiple(
size_t multiple)
noexcept {
3330 mConfig.mClockResolutionMultiple = multiple;
3333size_t Bench::clockResolutionMultiple() const noexcept {
3334 return mConfig.mClockResolutionMultiple;
// Stores the maximum epoch time in the configuration.
// NOTE(review): this listing does not show the rest of the definition.
3338Bench& Bench::maxEpochTime(std::chrono::nanoseconds t)
noexcept {
3339 mConfig.mMaxEpochTime =
t;
3342std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
3343 return mConfig.mMaxEpochTime;
// Stores the minimum epoch time in the configuration.
// NOTE(review): this listing does not show the rest of the definition.
3347Bench& Bench::minEpochTime(std::chrono::nanoseconds t)
noexcept {
3348 mConfig.mMinEpochTime =
t;
3351std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
3352 return mConfig.mMinEpochTime;
// Stores the minimum number of iterations per epoch. A request for 0 is
// stored as 1.
// NOTE(review): this listing does not show the rest of the definition.
3355Bench& Bench::minEpochIterations(uint64_t numIters)
noexcept {
3356 mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
3359uint64_t Bench::minEpochIterations() const noexcept {
3360 return mConfig.mMinEpochIterations;
// Stores the number of iterations per epoch in the configuration.
// NOTE(review): this listing does not show the rest of the definition.
3363Bench& Bench::epochIterations(uint64_t numIters)
noexcept {
3364 mConfig.mEpochIterations = numIters;
3367uint64_t Bench::epochIterations() const noexcept {
3368 return mConfig.mEpochIterations;
// Stores the number of warmup iterations in the configuration.
// NOTE(review): this listing does not show the rest of the definition.
3371Bench& Bench::warmup(uint64_t numWarmupIters)
noexcept {
3372 mConfig.mWarmup = numWarmupIters;
3375uint64_t Bench::warmup() const noexcept {
3376 return mConfig.mWarmup;
// Replaces the whole configuration with a copy of `benchmarkConfig`.
// NOTE(review): this listing does not show the rest of the definition.
3379Bench& Bench::config(Config
const& benchmarkConfig) {
3380 mConfig = benchmarkConfig;
3383Config
const& Bench::config() const noexcept {
// Stores the output stream pointer in the configuration.
// NOTE(review): this listing does not show the rest of the definition.
3387Bench& Bench::output(std::ostream* outstream)
noexcept {
3388 mConfig.mOut = outstream;
3393 return mConfig.mOut;
3396std::vector<Result>
const& Bench::results() const noexcept {
3400Bench&
Bench::render(
char const* templateContent, std::ostream& os) {
3405Bench&
Bench::render(std::string
const& templateContent, std::ostream& os) {
// Fits the collected (complexityN, median time) pairs against a fixed set of
// candidate complexity functions and sorts the candidates with BigO's
// operator<.
// NOTE(review): the bodies of four of the lambdas and the return statement
// are not visible in this listing.
3410std::vector<BigO> Bench::complexityBigO()
const {
3411 std::vector<BigO> bigOs;
3412 auto rangeMeasure = BigO::collectRangeMeasure(mResults);
3413 bigOs.emplace_back(
"O(1)", rangeMeasure, [](
double) {
3416 bigOs.emplace_back(
"O(n)", rangeMeasure, [](
double n) {
3419 bigOs.emplace_back(
"O(log n)", rangeMeasure, [](
double n) {
3420 return std::log2(n);
3422 bigOs.emplace_back(
"O(n log n)", rangeMeasure, [](
double n) {
3423 return n * std::log2(n);
3425 bigOs.emplace_back(
"O(n^2)", rangeMeasure, [](
double n) {
3428 bigOs.emplace_back(
"O(n^3)", rangeMeasure, [](
double n) {
3431 std::sort(bigOs.begin(), bigOs.end());
3438 std::random_device rd;
3439 std::uniform_int_distribution<uint64_t> dist;
3443 }
while (mX == 0 && mY == 0);
// One step of the SplitMix64 generator.
//
// Advances `state` by the constant 0x9e3779b97f4a7c15 (unsigned arithmetic
// wraps modulo 2^64) and returns a mixed version of the new state. `state` is
// an in/out parameter, so repeated calls with the same variable produce
// successive outputs.
uint64_t splitMix64(uint64_t& state) noexcept {
    uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15));
    z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
    z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb);
    return z ^ (z >> 31U);
}
// Seeds both state words from `seed` with two successive splitMix64 calls,
// then runs a loop of 10 rounds.
// NOTE(review): the loop body is not visible in this listing.
3455Rng::Rng(uint64_t seed) noexcept
3456 : mX(splitMix64(seed))
3457 , mY(splitMix64(seed)) {
3458 for (
size_t i = 0; i < 10; ++i) {
3464Rng::Rng(uint64_t x, uint64_t y) noexcept
3468Rng Rng::copy() const noexcept {
// Constructs the generator from `data`, which must hold exactly two entries;
// any other size throws std::runtime_error.
// NOTE(review): the member initialisers are not visible in this listing.
3472Rng::Rng(std::vector<uint64_t>
const&
data)
3475 if (
data.size() != 2) {
3476 throw std::runtime_error(
"ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " +
3477 detail::fmt::to_s(
data.size()));
// Returns the generator state as a vector of two uint64_t values.
// NOTE(review): the lines that fill and return `data` are not visible.
3483std::vector<uint64_t> Rng::state()
const {
3484 std::vector<uint64_t>
data(2);
3490BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result>
const& results) {
3491 BigO::RangeMeasure rangeMeasure;
3492 for (
auto const& result : results) {
3493 if (result.config().mComplexityN > 0.0) {
3494 rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
3497 return rangeMeasure;
// Fits measure = mConstant * range to the given pairs. mConstant is
// sum(range * measure) / sum(range * range), and the stored error is
// sqrt(err / n) divided by the mean measure.
// NOTE(review): several lines are not visible here (the embedded numbers
// skip), including the declaration and accumulation of `err`.
// NOTE(review): no guard against an empty rangeMeasure or a zero
// sumRangeRange is visible; both would divide by zero. Confirm against the
// full source.
3500BigO::BigO(std::string bigOName, RangeMeasure
const& rangeMeasure)
3501 : mName(
std::move(bigOName)) {
3504 double sumRangeMeasure = 0.0;
3505 double sumRangeRange = 0.0;
3507 for (
const auto& rm : rangeMeasure) {
3508 sumRangeMeasure += rm.first * rm.second;
3509 sumRangeRange += rm.first * rm.first;
3511 mConstant = sumRangeMeasure / sumRangeRange;
3515 double sumMeasure = 0.0;
3516 for (
const auto& rm : rangeMeasure) {
3517 auto diff = mConstant * rm.first - rm.second;
3520 sumMeasure += rm.second;
3523 auto n = detail::d(rangeMeasure.size());
3524 auto mean = sumMeasure / n;
3525 mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
3528BigO::BigO(
const char* bigOName, RangeMeasure
const& rangeMeasure)
3529 : BigO(
std::string(bigOName), rangeMeasure) {}
3531std::string
const&
BigO::name() const noexcept {
3535double BigO::constant() const noexcept {
3539double BigO::normalizedRootMeanSquare() const noexcept {
3540 return mNormalizedRootMeanSquare;
3544 return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
3547std::ostream&
operator<<(std::ostream& os, BigO
const& bigO) {
3548 return os << bigO.constant() <<
" * " << bigO.name() <<
", rms=" << bigO.normalizedRootMeanSquare();
// Writes a markdown table with one row per fit: the coefficient in scientific
// notation, the error as a percentage and the complexity name.
// NOTE(review): the end of each row and the return statement are not visible
// in this listing.
3551std::ostream&
operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO>
const& bigOs) {
// NOTE(review): presumably puts the stream's formatting state back on scope
// exit. The destructor body is not visible here. Confirm.
3552 detail::fmt::StreamStateRestorer
const restorer(os);
3553 os << std::endl <<
"| coefficient | err% | complexity" << std::endl <<
"|--------------:|-------:|------------" << std::endl;
3554 for (
auto const& bigO : bigOs) {
3555 os <<
"|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() <<
" ";
3556 os <<
"|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) <<
"% ";
3557 os <<
"| " << bigO.name();
Main entry point to nanobench's benchmarking facility.
Bench & operator=(Bench const &other)
ANKERL_NANOBENCH(NODISCARD) std Bench & doNotOptimizeAway(Arg &&arg)
Retrieves all benchmark results collected by the bench object so far.
Bench & run(char const *benchmarkName, Op &&op)
Repeatedly calls op() based on the configuration, and performs measurements.
Bench & batch(T b) noexcept
Sets the batch size.
std::vector< BigO > complexityBigO() const
Bench()
Creates a new benchmark for configuration and running of benchmarks.
Bench & operator=(Bench &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))
detail::SetupRunner< SetupOp > setup(SetupOp setupOp)
Configure an untimed setup step per epoch (forces single-iteration epochs).
Bench(Bench &&other) noexcept
Bench(Bench const &other)
Bench & complexityN(T n) noexcept
static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op)
BigO(std::string bigOName, RangeMeasure const &scaledRangeMeasure)
std::vector< std::pair< double, double > > RangeMeasure
BigO(char const *bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
static RangeMeasure collectRangeMeasure(std::vector< Result > const &results)
BigO(std::string bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
BigO(char const *bigOName, RangeMeasure const &scaledRangeMeasure)
Result(Config benchmarkConfig)
static Measure fromString(std::string const &str)
void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const &pc)
Result(Result &&other) noexcept
ANKERL_NANOBENCH(NODISCARD) Config const &config() const noexcept
Result & operator=(Result const &other)
Result(Result const &other)
Result & operator=(Result &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))
An extremely fast random generator.
static constexpr uint64_t() min()
Rng(Rng const &)=delete
As a safety precaution, we don't allow copying.
void shuffle(Container &container) noexcept
Shuffles all entries in the given container.
Rng(Rng &&) noexcept=default
Rng & operator=(Rng const &)=delete
Same as Rng(Rng const&), we don't allow assignment.
static constexpr uint64_t() max()
double uniform01() noexcept
Provides a random uniform double value between 0 and 1.
uint64_t result_type
This RNG provides 64bit randomness.
void moveResultTo(std::vector< Result > &results) noexcept
void add(std::chrono::nanoseconds elapsed, PerformanceCounters const &pc) noexcept
IterationLogic(IterationLogic &&)=delete
IterationLogic & operator=(IterationLogic const &)=delete
ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept
IterationLogic(IterationLogic const &)=delete
IterationLogic(Bench const &bench)
IterationLogic & operator=(IterationLogic &&)=delete
SetupRunner(SetupOp setupOp, Bench &bench)
#define T(expected, seed, data)
void doNotOptimizeAway(T &val)
PerformanceCounters & performanceCounters()
void doNotOptimizeAway(T const &val)
char const * json() noexcept
Template to generate JSON data.
char const * csv() noexcept
CSV data for the benchmark results.
char const * pyperf() noexcept
Output in pyperf compatible JSON format, which can be used for more analyzation.
char const * htmlBoxplot() noexcept
HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an exampl...
void render(char const *mustacheTemplate, Bench const &bench, std::ostream &out)
Renders output from a mustache-like template and benchmark results.
std::conditional< std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock, std::chrono::steady_clock >::type Clock
void render(std::string const &mustacheTemplate, std::vector< Result > const &results, std::ostream &out)
std::ostream & operator<<(std::ostream &os, BigO const &bigO)
std::ostream & operator<<(std::ostream &os, std::vector< ankerl::nanobench::BigO > const &bigOs)
void doNotOptimizeAway(Arg &&arg)
Makes sure none of the given arguments are optimized away by the compiler.
#define ANKERL_NANOBENCH_LOG(x)
#define ANKERL_NANOBENCH_NO_SANITIZE(...)
#define ANKERL_NANOBENCH(x)
bool operator==(const CNetAddr &a, const CNetAddr &b)
bool operator<(const CNetAddr &a, const CNetAddr &b)
Config & operator=(Config const &other)
Config(Config const &other)
Config & operator=(Config &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))
Config(Config &&other) noexcept
static SECP256K1_INLINE uint64_t rotl(const uint64_t x, int k)