nanobench_8h_source.html

//  __   _ _______ __   _  _____  ______  _______ __   _ _______ _     _

//  | \  | |_____| | \  | |     | |_____] |______ | \  | |       |_____|

//  |  \_| |     | |  \_| |_____| |_____] |______ |  \_| |_____  |     |

//

// Microbenchmark framework for C++11/14/17/20

// https://github.com/martinus/nanobench

//

// Licensed under the MIT License <http://opensource.org/licenses/MIT>.

// SPDX-License-Identifier: MIT

// Copyright (c) 2019-2023 Martin Leitner-Ankerl <martin.ankerl@gmail.com>

//

// Permission is hereby granted, free of charge, to any person obtaining a copy

// of this software and associated documentation files (the "Software"), to deal

// in the Software without restriction, including without limitation the rights

// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell

// copies of the Software, and to permit persons to whom the Software is

// furnished to do so, subject to the following conditions:

//

// The above copyright notice and this permission notice shall be included in all

// copies or substantial portions of the Software.

//

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

// SOFTWARE.


#ifndef ANKERL_NANOBENCH_H_INCLUDED

#define ANKERL_NANOBENCH_H_INCLUDED


// see https://semver.org/

#define ANKERL_NANOBENCH_VERSION_MAJOR 4  // incompatible API changes

#define ANKERL_NANOBENCH_VERSION_MINOR 3  // backwards-compatible changes

#define ANKERL_NANOBENCH_VERSION_PATCH 11 // backwards-compatible bug fixes


// public facing api - as minimal as possible


#include <chrono>        // high_resolution_clock

#include <cstring>       // memcpy

#include <iosfwd>        // for std::ostream* custom output target in Config

#include <string>        // all names

#include <unordered_map> // holds context information of results

#include <vector>        // holds all results


#define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()


#define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus

#define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L

#define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L

#define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L

#define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L


#if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)

#    define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]

#else

#    define ANKERL_NANOBENCH_PRIVATE_NODISCARD()

#endif


#if defined(__clang__)

#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \

        _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")

#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")

#else

#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()

#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()

#endif


#if defined(__GNUC__)

#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")

#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")

#else

#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()

#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()

#endif


#if defined(ANKERL_NANOBENCH_LOG_ENABLED)

#    include <iostream>

#    define ANKERL_NANOBENCH_LOG(x)                                                 \

        do {                                                                        \

            std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \

        } while (0)

#else

#    define ANKERL_NANOBENCH_LOG(x) \

        do {                        \

        } while (0)

#endif


#define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0

#if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)

#    include <linux/version.h>

#    if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0)

// PERF_COUNT_HW_REF_CPU_CYCLES only available since kernel 3.3

// PERF_FLAG_FD_CLOEXEC since kernel 3.14

#        undef ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS

#        define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1

#    endif

#endif


#if defined(__clang__)

#    define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))

#else

#    define ANKERL_NANOBENCH_NO_SANITIZE(...)

#endif


#if defined(_MSC_VER)

#    define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)

#else

#    define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))

#endif


// workaround missing "is_trivially_copyable" in g++ < 5.0

// See https://stackoverflow.com/a/31798726/48181

#if defined(__GNUC__) && __GNUC__ < 5

#    define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)

#else

#    define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value

#endif


// noexcept may be missing for std::string.

// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58265

#define ANKERL_NANOBENCH_PRIVATE_NOEXCEPT_STRING_MOVE() std::is_nothrow_move_assignable<std::string>::value


// declarations ///////////////////////////////////////////////////////////////////////////////////


namespace ankerl {

namespace nanobench {


using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,

                               std::chrono::steady_clock>::type;

class Bench;

struct Config;

class Result;

class Rng;

class BigO;


void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);

void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out);


void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);

void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out);


// Contains mustache-like templates

namespace templates {


char const* csv() noexcept;


char const* htmlBoxplot() noexcept;


char const* pyperf() noexcept;


char const* json() noexcept;


} // namespace templates


namespace detail {


template <typename T>

struct PerfCountSet;


class IterationLogic;

class PerformanceCounters;


#if ANKERL_NANOBENCH(PERF_COUNTERS)

class LinuxPerformanceCounters;

#endif


} // namespace detail

} // namespace nanobench

} // namespace ankerl


// definitions ////////////////////////////////////////////////////////////////////////////////////


namespace ankerl {

namespace nanobench {

namespace detail {


template <typename T>

struct PerfCountSet {

    T pageFaults{};

    T cpuCycles{};

    T contextSwitches{};

    T instructions{};

    T branchInstructions{};

    T branchMisses{};

};


} // namespace detail


ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)

struct Config {

    // actual benchmark config

    std::string mBenchmarkTitle = "benchmark";                               // NOLINT(misc-non-private-member-variables-in-classes)

    std::string mBenchmarkName = "noname";                                   // NOLINT(misc-non-private-member-variables-in-classes)

    std::string mUnit = "op";                                                // NOLINT(misc-non-private-member-variables-in-classes)

    double mBatch = 1.0;                                                     // NOLINT(misc-non-private-member-variables-in-classes)

    double mComplexityN = -1.0;                                              // NOLINT(misc-non-private-member-variables-in-classes)

    size_t mNumEpochs = 11;                                                  // NOLINT(misc-non-private-member-variables-in-classes)

    size_t mClockResolutionMultiple = static_cast<size_t>(1000);             // NOLINT(misc-non-private-member-variables-in-classes)

    std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100); // NOLINT(misc-non-private-member-variables-in-classes)

    std::chrono::nanoseconds mMinEpochTime = std::chrono::milliseconds(1);   // NOLINT(misc-non-private-member-variables-in-classes)

    uint64_t mMinEpochIterations{1};                                         // NOLINT(misc-non-private-member-variables-in-classes)

    // If not 0, run *exactly* these number of iterations per epoch.

    uint64_t mEpochIterations{0};                                          // NOLINT(misc-non-private-member-variables-in-classes)

    uint64_t mWarmup = 0;                                                  // NOLINT(misc-non-private-member-variables-in-classes)

    std::ostream* mOut = nullptr;                                          // NOLINT(misc-non-private-member-variables-in-classes)

    std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1}; // NOLINT(misc-non-private-member-variables-in-classes)

    std::string mTimeUnitName = "ns";                                      // NOLINT(misc-non-private-member-variables-in-classes)

    bool mShowPerformanceCounters = true;                                  // NOLINT(misc-non-private-member-variables-in-classes)

    bool mIsRelative = false;                                              // NOLINT(misc-non-private-member-variables-in-classes)

    std::unordered_map<std::string, std::string> mContext{};               // NOLINT(misc-non-private-member-variables-in-classes)


    Config();

    ~Config();

    Config& operator=(Config const& other);

    Config& operator=(Config&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));

    Config(Config const& other);

    Config(Config&& other) noexcept;

};

ANKERL_NANOBENCH(IGNORE_PADDED_POP)


// Result returned after a benchmark has finished. Can be used as a baseline for relative().

ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)

class Result {

public:

    enum class Measure : size_t {

        elapsed,

        iterations,

        pagefaults,

        cpucycles,

        contextswitches,

        instructions,

        branchinstructions,

        branchmisses,

        _size

    };


    explicit Result(Config benchmarkConfig);


    ~Result();

    Result& operator=(Result const& other);

    Result& operator=(Result&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));

    Result(Result const& other);

    Result(Result&& other) noexcept;


    // adds new measurement results

    // all values are scaled by iters (except iters...)

    void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);


    ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;


    ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;

    ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;

    ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;

    ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;

    ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;

    ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;

    ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;

    ANKERL_NANOBENCH(NODISCARD) std::string const& context(char const* variableName) const;

    ANKERL_NANOBENCH(NODISCARD) std::string const& context(std::string const& variableName) const;


    ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;

    ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;

    ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;

    ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept;


    // Finds string, if not found, returns _size.

    static Measure fromString(std::string const& str);


private:

    Config mConfig{};

    std::vector<std::vector<double>> mNameToMeasurements{};

};

ANKERL_NANOBENCH(IGNORE_PADDED_POP)


class Rng final {

public:

    using result_type = uint64_t;


    static constexpr uint64_t(min)();

    static constexpr uint64_t(max)();


    Rng(Rng const&) = delete;


    Rng& operator=(Rng const&) = delete;


    // moving is ok

    Rng(Rng&&) noexcept = default;

    Rng& operator=(Rng&&) noexcept = default;

    ~Rng() noexcept = default;


    Rng();


    explicit Rng(uint64_t seed) noexcept;

    Rng(uint64_t x, uint64_t y) noexcept;

    explicit Rng(std::vector<uint64_t> const& data);


    ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;


    inline uint64_t operator()() noexcept;


    // This is slightly biased. See


    inline uint32_t bounded(uint32_t range) noexcept;


    // random double in range [0, 1(

    // see http://prng.di.unimi.it/


    inline double uniform01() noexcept;


    template <typename Container>

    void shuffle(Container& container) noexcept;


    ANKERL_NANOBENCH(NODISCARD) std::vector<uint64_t> state() const;


private:

    static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;


    uint64_t mX;

    uint64_t mY;

};


ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)

class Bench {

public:

    Bench();


    Bench(Bench&& other) noexcept;

    Bench& operator=(Bench&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));

    Bench(Bench const& other);

    Bench& operator=(Bench const& other);

    ~Bench() noexcept;


    template <typename Op>

    ANKERL_NANOBENCH(NOINLINE)

    Bench& run(char const* benchmarkName, Op&& op);


    template <typename Op>

    ANKERL_NANOBENCH(NOINLINE)

    Bench& run(std::string const& benchmarkName, Op&& op);


    template <typename Op>

    ANKERL_NANOBENCH(NOINLINE)

    Bench& run(Op&& op);


    Bench& title(char const* benchmarkTitle);

    Bench& title(std::string const& benchmarkTitle);


    ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;


    Bench& name(char const* benchmarkName);

    Bench& name(std::string const& benchmarkName);

    ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;


    Bench& context(char const* variableName, char const* variableValue);

    Bench& context(std::string const& variableName, std::string const& variableValue);


    Bench& clearContext();


    template <typename T>

    Bench& batch(T b) noexcept;

    ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;


    Bench& unit(char const* unit);

    Bench& unit(std::string const& unit);

    ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;


    Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);

    ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept;

    ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;


    Bench& output(std::ostream* outstream) noexcept;

    ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;


    Bench& clockResolutionMultiple(size_t multiple) noexcept;

    ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;


    Bench& epochs(size_t numEpochs) noexcept;

    ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;


    Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;

    ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;


    Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;

    ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;


    Bench& minEpochIterations(uint64_t numIters) noexcept;

    ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;


    Bench& epochIterations(uint64_t numIters) noexcept;

    ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;


    Bench& warmup(uint64_t numWarmupIters) noexcept;

    ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;


    Bench& relative(bool isRelativeEnabled) noexcept;

    ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;


    Bench& performanceCounters(bool showPerformanceCounters) noexcept;

    ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;


    ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;


    template <typename Arg>

    Bench& doNotOptimizeAway(Arg&& arg);


    template <typename T>

    Bench& complexityN(T n) noexcept;

    ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;


    std::vector<BigO> complexityBigO() const;


    template <typename Op>

    BigO complexityBigO(char const* name, Op op) const;


    template <typename Op>

    BigO complexityBigO(std::string const& name, Op op) const;


    Bench& render(char const* templateContent, std::ostream& os);

    Bench& render(std::string const& templateContent, std::ostream& os);


    Bench& config(Config const& benchmarkConfig);

    ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;


private:

    Config mConfig{};

    std::vector<Result> mResults{};

};

ANKERL_NANOBENCH(IGNORE_PADDED_POP)


template <typename Arg>

void doNotOptimizeAway(Arg&& arg);


namespace detail {


#if defined(_MSC_VER)

void doNotOptimizeAwaySink(void const*);


template <typename T>

void doNotOptimizeAway(T const& val);


#else


// These assembly magic is directly from what Google Benchmark is doing. I have previously used what facebook's folly was doing, but

// this seemed to have compilation problems in some cases. Google Benchmark seemed to be the most well tested anyways.

// see https://github.com/google/benchmark/blob/v1.7.1/include/benchmark/benchmark.h#L443-L446

template <typename T>

void doNotOptimizeAway(T const& val) {

    // NOLINTNEXTLINE(hicpp-no-assembler)

    asm volatile("" : : "r,m"(val) : "memory");

}


template <typename T>

void doNotOptimizeAway(T& val) {

#    if defined(__clang__)

    // NOLINTNEXTLINE(hicpp-no-assembler)

    asm volatile("" : "+r,m"(val) : : "memory");

#    else

    // NOLINTNEXTLINE(hicpp-no-assembler)

    asm volatile("" : "+m,r"(val) : : "memory");

#    endif

}

#endif


// internally used, but visible because run() is templated.

// Not movable/copy-able, so we simply use a pointer instead of unique_ptr. This saves us from

// having to include <memory>, and the template instantiation overhead of unique_ptr which is unfortunately quite significant.

ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)

class IterationLogic {

public:

    explicit IterationLogic(Bench const& bench);

    IterationLogic(IterationLogic&&) = delete;

    IterationLogic& operator=(IterationLogic&&) = delete;

    IterationLogic(IterationLogic const&) = delete;

    IterationLogic& operator=(IterationLogic const&) = delete;

    ~IterationLogic();


    ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;

    void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;

    void moveResultTo(std::vector<Result>& results) noexcept;


private:

    struct Impl;

    Impl* mPimpl;

};

ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)


ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)

class PerformanceCounters {

public:

    PerformanceCounters(PerformanceCounters const&) = delete;

    PerformanceCounters(PerformanceCounters&&) = delete;

    PerformanceCounters& operator=(PerformanceCounters const&) = delete;

    PerformanceCounters& operator=(PerformanceCounters&&) = delete;


    PerformanceCounters();

    ~PerformanceCounters();


    void beginMeasure();

    void endMeasure();

    void updateResults(uint64_t numIters);


    ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;

    ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;


private:

#if ANKERL_NANOBENCH(PERF_COUNTERS)

    LinuxPerformanceCounters* mPc = nullptr;

#endif

    PerfCountSet<uint64_t> mVal{};

    PerfCountSet<bool> mHas{};

};

ANKERL_NANOBENCH(IGNORE_PADDED_POP)


// Gets the singleton

PerformanceCounters& performanceCounters();


} // namespace detail


class BigO {

public:

    using RangeMeasure = std::vector<std::pair<double, double>>;


    template <typename Op>

    static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op) {

        for (auto& rangeMeasure : data) {

            rangeMeasure.first = op(rangeMeasure.first);

        }

        return data;

    }


    static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);


    template <typename Op>

    BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)

        : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}


    template <typename Op>

    BigO(std::string bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)

        : BigO(std::move(bigOName), mapRangeMeasure(rangeMeasure, rangeToN)) {}


    BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);

    BigO(std::string bigOName, RangeMeasure const& scaledRangeMeasure);

    ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;

    ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;

    ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;

    ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;


private:

    std::string mName{};

    double mConstant{};

    double mNormalizedRootMeanSquare{};

};

std::ostream& operator<<(std::ostream& os, BigO const& bigO);

std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs);


} // namespace nanobench

} // namespace ankerl


// implementation /////////////////////////////////////////////////////////////////////////////////


namespace ankerl {

namespace nanobench {


constexpr uint64_t(Rng::min)() {

    return 0;

}


constexpr uint64_t(Rng::max)() {

    return (std::numeric_limits<uint64_t>::max)();

}


ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")

uint64_t Rng::operator()() noexcept {

    auto x = mX;


    mX = UINT64_C(15241094284759029579) * mY;

    mY = rotl(mY - x, 27);


    return x;

}


ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")

uint32_t Rng::bounded(uint32_t range) noexcept {

    uint64_t const r32 = static_cast<uint32_t>(operator()());

    auto multiresult = r32 * range;

    return static_cast<uint32_t>(multiresult >> 32U);

}


double Rng::uniform01() noexcept {

    auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);

    // can't use union in c++ here for type puning, it's undefined behavior.

    // std::memcpy is optimized anyways.

    double d{};

    std::memcpy(&d, &i, sizeof(double));

    return d - 1.0;

}


template <typename Container>

void Rng::shuffle(Container& container) noexcept {

    auto i = container.size();

    while (i > 1U) {

        using std::swap;

        auto n = operator()();

        // using decltype(i) instead of size_t to be compatible to containers with 32bit index (see #80)

        auto b1 = static_cast<decltype(i)>((static_cast<uint32_t>(n) * static_cast<uint64_t>(i)) >> 32U);

        swap(container[--i], container[b1]);


        auto b2 = static_cast<decltype(i)>(((n >> 32U) * static_cast<uint64_t>(i)) >> 32U);

        swap(container[--i], container[b2]);

    }

}


ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")

constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {

    return (x << k) | (x >> (64U - k));

}


template <typename Op>

ANKERL_NANOBENCH_NO_SANITIZE("integer")

Bench& Bench::run(Op&& op) {

    // It is important that this method is kept short so the compiler can do better optimizations/ inlining of op()

    detail::IterationLogic iterationLogic(*this);

    auto& pc = detail::performanceCounters();


    while (auto n = iterationLogic.numIters()) {

        pc.beginMeasure();

        Clock::time_point const before = Clock::now();

        while (n-- > 0) {

            op();

        }

        Clock::time_point const after = Clock::now();

        pc.endMeasure();

        pc.updateResults(iterationLogic.numIters());

        iterationLogic.add(after - before, pc);

    }

    iterationLogic.moveResultTo(mResults);

    return *this;

}


// Performs all evaluations.

template <typename Op>

Bench& Bench::run(char const* benchmarkName, Op&& op) {

    name(benchmarkName);

    return run(std::forward<Op>(op));

}


template <typename Op>

Bench& Bench::run(std::string const& benchmarkName, Op&& op) {

    name(benchmarkName);

    return run(std::forward<Op>(op));

}


template <typename Op>

BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {

    return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);

}


template <typename Op>

BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {

    return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);

}


// Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration.

// Any argument is cast to double.

template <typename T>

Bench& Bench::batch(T b) noexcept {

    mConfig.mBatch = static_cast<double>(b);

    return *this;

}


// Sets the computation complexity of the next run. Any argument is cast to double.

template <typename T>

Bench& Bench::complexityN(T n) noexcept {

    mConfig.mComplexityN = static_cast<double>(n);

    return *this;

}


// Convenience: makes sure none of the given arguments are optimized away by the compiler.

template <typename Arg>

Bench& Bench::doNotOptimizeAway(Arg&& arg) {

    detail::doNotOptimizeAway(std::forward<Arg>(arg));

    return *this;

}


// Makes sure none of the given arguments are optimized away by the compiler.

template <typename Arg>

void doNotOptimizeAway(Arg&& arg) {

    detail::doNotOptimizeAway(std::forward<Arg>(arg));

}


namespace detail {


#if defined(_MSC_VER)

template <typename T>

void doNotOptimizeAway(T const& val) {

    doNotOptimizeAwaySink(&val);

}


#endif


} // namespace detail

} // namespace nanobench

} // namespace ankerl


#if defined(ANKERL_NANOBENCH_IMPLEMENT)


// implementation part - only visible in .cpp


#    include <algorithm> // sort, reverse

#    include <atomic>    // compare_exchange_strong in loop overhead

#    include <cstdlib>   // getenv

#    include <cstring>   // strstr, strncmp

#    include <fstream>   // ifstream to parse proc files

#    include <iomanip>   // setw, setprecision

#    include <iostream>  // cout

#    include <numeric>   // accumulate

#    include <random>    // random_device

#    include <sstream>   // to_s in Number

#    include <stdexcept> // throw for rendering templates

#    include <tuple>     // std::tie

#    if defined(__linux__)

#        include <unistd.h> //sysconf

#    endif

#    if ANKERL_NANOBENCH(PERF_COUNTERS)

#        include <map> // map


#        include <linux/perf_event.h>

#        include <sys/ioctl.h>

#        include <sys/syscall.h>

#    endif


// declarations ///////////////////////////////////////////////////////////////////////////////////


namespace ankerl {

namespace nanobench {


// helper stuff that is only intended to be used internally

namespace detail {


struct TableInfo;


// formatting utilities

namespace fmt {


class NumSep;

class StreamStateRestorer;

class Number;

class MarkDownColumn;

class MarkDownCode;


} // namespace fmt

} // namespace detail

} // namespace nanobench

} // namespace ankerl


// definitions ////////////////////////////////////////////////////////////////////////////////////


namespace ankerl {

namespace nanobench {


uint64_t splitMix64(uint64_t& state) noexcept;


namespace detail {


// helpers to get double values

template <typename T>

inline double d(T t) noexcept {

    return static_cast<double>(t);

}

inline double d(Clock::duration duration) noexcept {

    return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();

}


// Calculates clock resolution once, and remembers the result

inline Clock::duration clockResolution() noexcept;


} // namespace detail


namespace templates {


char const* csv() noexcept {

    return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"

{{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}

{{/result}})DELIM";

}


char const* htmlBoxplot() noexcept {

    return R"DELIM(<html>


<head>

    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>

</head>


<body>

    <div id="myDiv"></div>

    <script>

        var data = [

            {{#result}}{

                name: '{{name}}',

                y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}],

            },

            {{/result}}

        ];

        var title = '{{title}}';


        data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));

        var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});

    </script>

</body>


</html>)DELIM";

}


char const* pyperf() noexcept {

    return R"DELIM({

    "benchmarks": [

        {

            "runs": [

                {

                    "values": [

{{#measurement}}                        {{elapsed}}{{^-last}},

{{/last}}{{/measurement}}

                    ]

                }

            ]

        }

    ],

    "metadata": {

        "loops": {{sum(iterations)}},

        "inner_loops": {{batch}},

        "name": "{{title}}",

        "unit": "second"

    },

    "version": "1.0"

})DELIM";

}


char const* json() noexcept {

    return R"DELIM({

    "results": [

{{#result}}        {

            "title": "{{title}}",

            "name": "{{name}}",

            "unit": "{{unit}}",

            "batch": {{batch}},

            "complexityN": {{complexityN}},

            "epochs": {{epochs}},

            "clockResolution": {{clockResolution}},

            "clockResolutionMultiple": {{clockResolutionMultiple}},

            "maxEpochTime": {{maxEpochTime}},

            "minEpochTime": {{minEpochTime}},

            "minEpochIterations": {{minEpochIterations}},

            "epochIterations": {{epochIterations}},

            "warmup": {{warmup}},

            "relative": {{relative}},

            "median(elapsed)": {{median(elapsed)}},

            "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},

            "median(instructions)": {{median(instructions)}},

            "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},

            "median(cpucycles)": {{median(cpucycles)}},

            "median(contextswitches)": {{median(contextswitches)}},

            "median(pagefaults)": {{median(pagefaults)}},

            "median(branchinstructions)": {{median(branchinstructions)}},

            "median(branchmisses)": {{median(branchmisses)}},

            "totalTime": {{sumProduct(iterations, elapsed)}},

            "measurements": [

{{#measurement}}                {

                    "iterations": {{iterations}},

                    "elapsed": {{elapsed}},

                    "pagefaults": {{pagefaults}},

                    "cpucycles": {{cpucycles}},

                    "contextswitches": {{contextswitches}},

                    "instructions": {{instructions}},

                    "branchinstructions": {{branchinstructions}},

                    "branchmisses": {{branchmisses}}

                }{{^-last}},{{/-last}}

{{/measurement}}            ]

        }{{^-last}},{{/-last}}

{{/result}}    ]

})DELIM";

}


ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)

struct Node {

    enum class Type { tag, content, section, inverted_section };


    char const* begin;

    char const* end;

    std::vector<Node> children;

    Type type;


    template <size_t N>

    // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)

    bool operator==(char const (&str)[N]) const noexcept {

        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay)

        return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);

    }

};

ANKERL_NANOBENCH(IGNORE_PADDED_POP)


// NOLINTNEXTLINE(misc-no-recursion)

static std::vector<Node> parseMustacheTemplate(char const** tpl) {

    std::vector<Node> nodes;


    while (true) {

        auto const* begin = std::strstr(*tpl, "{{");

        auto const* end = begin;

        if (begin != nullptr) {

            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)

            begin += 2;

            end = std::strstr(begin, "}}");

        }


        if (begin == nullptr || end == nullptr) {

            // nothing found, finish node

            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)

            nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});

            return nodes;

        }


        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)

        nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});


        // we found a tag

        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)

        *tpl = end + 2;

        switch (*begin) {

        case '/':

            // finished! bail out

            return nodes;


        case '#':

            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)

            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});

            break;


        case '^':

            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)

            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});

            break;


        default:

            nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});

            break;

        }

    }

}


static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {

    ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));

    bool const matchFirst = n == "-first";

    bool const matchLast = n == "-last";

    if (!matchFirst && !matchLast) {

        return false;

    }


    bool doWrite = false;

    if (n.type == Node::Type::section) {

        doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);

    } else if (n.type == Node::Type::inverted_section) {

        doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);

    }


    if (doWrite) {

        for (auto const& child : n.children) {

            if (child.type == Node::Type::content) {

                out.write(child.begin, std::distance(child.begin, child.end));

            }

        }

    }

    return true;

}


static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {

    matchResult.clear();

    auto idxOpen = str.find('(');

    auto idxClose = str.find(')', idxOpen);

    if (idxClose == std::string::npos) {

        return false;

    }


    matchResult.emplace_back(str.substr(0, idxOpen));


    // split by comma

    matchResult.emplace_back();

    for (size_t i = idxOpen + 1; i != idxClose; ++i) {

        if (str[i] == ' ' || str[i] == '\t') {

            // skip whitespace

            continue;

        }

        if (str[i] == ',') {

            // got a comma => new string

            matchResult.emplace_back();

            continue;

        }

        // no whitespace no comma, append

        matchResult.back() += str[i];

    }

    return true;

}


static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {

    using detail::d;


    if (n == "title") {

        out << config.mBenchmarkTitle;

        return true;

    }

    if (n == "name") {

        out << config.mBenchmarkName;

        return true;

    }

    if (n == "unit") {

        out << config.mUnit;

        return true;

    }

    if (n == "batch") {

        out << config.mBatch;

        return true;

    }

    if (n == "complexityN") {

        out << config.mComplexityN;

        return true;

    }

    if (n == "epochs") {

        out << config.mNumEpochs;

        return true;

    }

    if (n == "clockResolution") {

        out << d(detail::clockResolution());

        return true;

    }

    if (n == "clockResolutionMultiple") {

        out << config.mClockResolutionMultiple;

        return true;

    }

    if (n == "maxEpochTime") {

        out << d(config.mMaxEpochTime);

        return true;

    }

    if (n == "minEpochTime") {

        out << d(config.mMinEpochTime);

        return true;

    }

    if (n == "minEpochIterations") {

        out << config.mMinEpochIterations;

        return true;

    }

    if (n == "epochIterations") {

        out << config.mEpochIterations;

        return true;

    }

    if (n == "warmup") {

        out << config.mWarmup;

        return true;

    }

    if (n == "relative") {

        out << config.mIsRelative;

        return true;

    }

    return false;

}


// NOLINTNEXTLINE(readability-function-cognitive-complexity)

static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {

    if (generateConfigTag(n, r.config(), out)) {

        return out;

    }

    // match e.g. "median(elapsed)"

    // g++ 4.8 doesn't implement std::regex :(

    // static std::regex const regOpArg1("^([a-zA-Z]+)\\‍(([a-zA-Z]*)\\‍)$");

    // std::cmatch matchResult;

    // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) {

    std::vector<std::string> matchResult;

    if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {

        if (matchResult.size() == 2) {

            if (matchResult[0] == "context") {

                return out << r.context(matchResult[1]);

            }


            auto m = Result::fromString(matchResult[1]);

            if (m == Result::Measure::_size) {

                return out << 0.0;

            }


            if (matchResult[0] == "median") {

                return out << r.median(m);

            }

            if (matchResult[0] == "average") {

                return out << r.average(m);

            }

            if (matchResult[0] == "medianAbsolutePercentError") {

                return out << r.medianAbsolutePercentError(m);

            }

            if (matchResult[0] == "sum") {

                return out << r.sum(m);

            }

            if (matchResult[0] == "minimum") {

                return out << r.minimum(m);

            }

            if (matchResult[0] == "maximum") {

                return out << r.maximum(m);

            }

        } else if (matchResult.size() == 3) {

            auto m1 = Result::fromString(matchResult[1]);

            auto m2 = Result::fromString(matchResult[2]);

            if (m1 == Result::Measure::_size || m2 == Result::Measure::_size) {

                return out << 0.0;

            }


            if (matchResult[0] == "sumProduct") {

                return out << r.sumProduct(m1, m2);

            }

        }

    }


    // match e.g. "sumProduct(elapsed, iterations)"

    // static std::regex const regOpArg2("^([a-zA-Z]+)\\‍(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\‍)$");


    // nothing matches :(

    throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood");

}


static void generateResultMeasurement(std::vector<Node> const& nodes, size_t idx, Result const& r, std::ostream& out) {

    for (auto const& n : nodes) {

        if (!generateFirstLast(n, idx, r.size(), out)) {

            ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));

            switch (n.type) {

            case Node::Type::content:

                out.write(n.begin, std::distance(n.begin, n.end));

                break;


            case Node::Type::inverted_section:

                throw std::runtime_error("got a inverted section inside measurement");


            case Node::Type::section:

                throw std::runtime_error("got a section inside measurement");


            case Node::Type::tag: {

                auto m = Result::fromString(std::string(n.begin, n.end));

                if (m == Result::Measure::_size || !r.has(m)) {

                    out << 0.0;

                } else {

                    out << r.get(idx, m);

                }

                break;

            }

            }

        }

    }

}


static void generateResult(std::vector<Node> const& nodes, size_t idx, std::vector<Result> const& results, std::ostream& out) {

    auto const& r = results[idx];

    for (auto const& n : nodes) {

        if (!generateFirstLast(n, idx, results.size(), out)) {

            ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));

            switch (n.type) {

            case Node::Type::content:

                out.write(n.begin, std::distance(n.begin, n.end));

                break;


            case Node::Type::inverted_section:

                throw std::runtime_error("got a inverted section inside result");


            case Node::Type::section:

                if (n == "measurement") {

                    for (size_t i = 0; i < r.size(); ++i) {

                        generateResultMeasurement(n.children, i, r, out);

                    }

                } else {

                    throw std::runtime_error("got a section inside result");

                }

                break;


            case Node::Type::tag:

                generateResultTag(n, r, out);

                break;

            }

        }

    }

}


} // namespace templates


// helper stuff that only intended to be used internally

namespace detail {


char const* getEnv(char const* name);

bool isEndlessRunning(std::string const& name);

bool isWarningsEnabled();


template <typename T>

T parseFile(std::string const& filename, bool* fail);


void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);

void printStabilityInformationOnce(std::ostream* outStream);


// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.

uint64_t& singletonHeaderHash() noexcept;


// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.

Clock::duration calcClockResolution(size_t numEvaluations) noexcept;


// formatting utilities

namespace fmt {


// adds thousands separator to numbers

ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)

class NumSep : public std::numpunct<char> {

public:

    explicit NumSep(char sep);

    char do_thousands_sep() const override;

    std::string do_grouping() const override;


private:

    char mSep;

};

ANKERL_NANOBENCH(IGNORE_PADDED_POP)


// RAII to save & restore a stream's state

ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)

class StreamStateRestorer {

public:

    explicit StreamStateRestorer(std::ostream& s);

    ~StreamStateRestorer();


    // sets back all stream info that we remembered at construction

    void restore();


    // don't allow copying / moving

    StreamStateRestorer(StreamStateRestorer const&) = delete;

    StreamStateRestorer& operator=(StreamStateRestorer const&) = delete;

    StreamStateRestorer(StreamStateRestorer&&) = delete;

    StreamStateRestorer& operator=(StreamStateRestorer&&) = delete;


private:

    std::ostream& mStream;

    std::locale mLocale;

    std::streamsize const mPrecision;

    std::streamsize const mWidth;

    std::ostream::char_type const mFill;

    std::ostream::fmtflags const mFmtFlags;

};

ANKERL_NANOBENCH(IGNORE_PADDED_POP)


// Number formatter

class Number {

public:

    Number(int width, int precision, double value);

    Number(int width, int precision, int64_t value);

    ANKERL_NANOBENCH(NODISCARD) std::string to_s() const;


private:

    friend std::ostream& operator<<(std::ostream& os, Number const& n);

    std::ostream& write(std::ostream& os) const;


    int mWidth;

    int mPrecision;

    double mValue;

};


// helper replacement for std::to_string of signed/unsigned numbers so we are locale independent

std::string to_s(uint64_t n);


std::ostream& operator<<(std::ostream& os, Number const& n);


class MarkDownColumn {

public:

    MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val) noexcept;

    ANKERL_NANOBENCH(NODISCARD) std::string title() const;

    ANKERL_NANOBENCH(NODISCARD) std::string separator() const;

    ANKERL_NANOBENCH(NODISCARD) std::string invalid() const;

    ANKERL_NANOBENCH(NODISCARD) std::string value() const;


private:

    int mWidth;

    int mPrecision;

    std::string mTitle;

    std::string mSuffix;

    double mValue;

};


// Formats any text as markdown code, escaping backticks.

class MarkDownCode {

public:

    explicit MarkDownCode(std::string const& what);


private:

    friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);

    std::ostream& write(std::ostream& os) const;


    std::string mWhat{};

};


std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);


} // namespace fmt

} // namespace detail

} // namespace nanobench

} // namespace ankerl


// implementation /////////////////////////////////////////////////////////////////////////////////


namespace ankerl {

namespace nanobench {


// NOLINTNEXTLINE(readability-function-cognitive-complexity)

void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {

    detail::fmt::StreamStateRestorer const restorer(out);


    out.precision(std::numeric_limits<double>::digits10);

    auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);


    for (auto const& n : nodes) {

        ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));

        switch (n.type) {

        case templates::Node::Type::content:

            out.write(n.begin, std::distance(n.begin, n.end));

            break;


        case templates::Node::Type::inverted_section:

            throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'");


        case templates::Node::Type::section:

            if (n == "result") {

                const size_t nbResults = results.size();

                for (size_t i = 0; i < nbResults; ++i) {

                    generateResult(n.children, i, results, out);

                }

            } else if (n == "measurement") {

                if (results.size() != 1) {

                    throw std::runtime_error(

                        "render: can only use section 'measurement' here if there is a single result, but there are " +

                        detail::fmt::to_s(results.size()));

                }

                // when we only have a single result, we can immediately go into its measurement.

                auto const& r = results.front();

                for (size_t i = 0; i < r.size(); ++i) {

                    generateResultMeasurement(n.children, i, r, out);

                }

            } else {

                throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'");

            }

            break;


        case templates::Node::Type::tag:

            if (results.size() == 1) {

                // result & config are both supported there

                generateResultTag(n, results.front(), out);

            } else {

                // This just uses the last result's config.

                if (!generateConfigTag(n, results.back().config(), out)) {

                    throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'");

                }

            }

            break;

        }

    }

}


void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {

    render(mustacheTemplate.c_str(), results, out);

}


void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {

    render(mustacheTemplate, bench.results(), out);

}


void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) {

    render(mustacheTemplate.c_str(), bench.results(), out);

}


namespace detail {


PerformanceCounters& performanceCounters() {

#    if defined(__clang__)

#        pragma clang diagnostic push

#        pragma clang diagnostic ignored "-Wexit-time-destructors"

#    endif

    static PerformanceCounters pc;

#    if defined(__clang__)

#        pragma clang diagnostic pop

#    endif

    return pc;

}


// Windows version of doNotOptimizeAway

// see https://github.com/google/benchmark/blob/v1.7.1/include/benchmark/benchmark.h#L514

// see https://github.com/facebook/folly/blob/v2023.01.30.00/folly/lang/Hint-inl.h#L54-L58

// see https://learn.microsoft.com/en-us/cpp/preprocessor/optimize

#    if defined(_MSC_VER)

#        pragma optimize("", off)

void doNotOptimizeAwaySink(void const*) {}

#        pragma optimize("", on)

#    endif


template <typename T>

T parseFile(std::string const& filename, bool* fail) {

    std::ifstream fin(filename); // NOLINT(misc-const-correctness)

    T num{};

    fin >> num;

    if (fail != nullptr) {

        *fail = fin.fail();

    }

    return num;

}


char const* getEnv(char const* name) {

#    if defined(_MSC_VER)

#        pragma warning(push)

#        pragma warning(disable : 4996) // getenv': This function or variable may be unsafe.

#    endif

    return std::getenv(name); // NOLINT(concurrency-mt-unsafe)

#    if defined(_MSC_VER)

#        pragma warning(pop)

#    endif

}


bool isEndlessRunning(std::string const& name) {

    auto const* const endless = getEnv("NANOBENCH_ENDLESS");

    return nullptr != endless && endless == name;

}


// True when environment variable NANOBENCH_SUPPRESS_WARNINGS is either not set at all, or set to "0"

bool isWarningsEnabled() {

    auto const* const suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS");

    return nullptr == suppression || suppression == std::string("0");

}


void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {

    warnings.clear();

    recommendations.clear();


#    if defined(DEBUG)

    warnings.emplace_back("DEBUG defined");

    bool const recommendCheckFlags = true;

#    else

    bool const recommendCheckFlags = false;

#    endif


    bool recommendPyPerf = false;

#    if defined(__linux__)

    auto nprocs = sysconf(_SC_NPROCESSORS_CONF);

    if (nprocs <= 0) {

        warnings.emplace_back("couldn't figure out number of processors - no governor, turbo check possible");

    } else {

        // check frequency scaling

        for (long id = 0; id < nprocs; ++id) {

            auto idStr = detail::fmt::to_s(static_cast<uint64_t>(id));

            auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr;

            auto minFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_min_freq", nullptr);

            auto maxFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_max_freq", nullptr);

            if (minFreq != maxFreq) {

                auto minMHz = d(minFreq) / 1000.0;

                auto maxMHz = d(maxFreq) / 1000.0;

                warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " +

                                      detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() +

                                      " MHz");

                recommendPyPerf = true;

                break;

            }

        }


        auto fail = false;

        auto currentGovernor = parseFile<std::string>("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor", &fail);

        if (!fail && "performance" != currentGovernor) {

            warnings.emplace_back("CPU governor is '" + currentGovernor + "' but should be 'performance'");

            recommendPyPerf = true;

        }


        auto noTurbo = parseFile<int>("/sys/devices/system/cpu/intel_pstate/no_turbo", &fail);

        if (!fail && noTurbo == 0) {

            warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate");

            recommendPyPerf = true;

        }

    }

#    endif


    if (recommendCheckFlags) {

        recommendations.emplace_back("Make sure you compile for Release");

    }

    if (recommendPyPerf) {

        recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");

    }

}


void printStabilityInformationOnce(std::ostream* outStream) {

    static bool shouldPrint = true;

    if (shouldPrint && (nullptr != outStream) && isWarningsEnabled()) {

        auto& os = *outStream;

        shouldPrint = false;

        std::vector<std::string> warnings;

        std::vector<std::string> recommendations;

        gatherStabilityInformation(warnings, recommendations);

        if (warnings.empty()) {

            return;

        }


        os << "Warning, results might be unstable:" << std::endl;

        for (auto const& w : warnings) {

            os << "* " << w << std::endl;

        }


        os << std::endl << "Recommendations" << std::endl;

        for (auto const& r : recommendations) {

            os << "* " << r << std::endl;

        }

    }

}


// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.

uint64_t& singletonHeaderHash() noexcept {

    static uint64_t sHeaderHash{};

    return sHeaderHash;

}


ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")

inline uint64_t hash_combine(uint64_t seed, uint64_t val) {

    return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U));

}


// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.

Clock::duration calcClockResolution(size_t numEvaluations) noexcept {

    auto bestDuration = Clock::duration::max();

    Clock::time_point tBegin;

    Clock::time_point tEnd;

    for (size_t i = 0; i < numEvaluations; ++i) {

        tBegin = Clock::now();

        do {

            tEnd = Clock::now();

        } while (tBegin == tEnd);

        bestDuration = (std::min)(bestDuration, tEnd - tBegin);

    }

    return bestDuration;

}


// Calculates clock resolution once, and remembers the result

Clock::duration clockResolution() noexcept {

    static Clock::duration const sResolution = calcClockResolution(20);

    return sResolution;

}


ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)

struct IterationLogic::Impl {

    enum class State { warmup, upscaling_runtime, measuring, endless };


    explicit Impl(Bench const& bench)

        : mBench(bench)

        , mResult(bench.config()) {

        printStabilityInformationOnce(mBench.output());


        // determine target runtime per epoch

        mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();

        if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {

            mTargetRuntimePerEpoch = mBench.maxEpochTime();

        }

        if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {

            mTargetRuntimePerEpoch = mBench.minEpochTime();

        }


        if (isEndlessRunning(mBench.name())) {

            std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl;

            mNumIters = (std::numeric_limits<uint64_t>::max)();

            mState = State::endless;

        } else if (0 != mBench.warmup()) {

            mNumIters = mBench.warmup();

            mState = State::warmup;

        } else if (0 != mBench.epochIterations()) {

            // exact number of iterations

            mNumIters = mBench.epochIterations();

            mState = State::measuring;

        } else {

            mNumIters = mBench.minEpochIterations();

            mState = State::upscaling_runtime;

        }

    }


    // directly calculates new iters based on elapsed&iters, and adds a 10% noise. Makes sure we don't underflow.

    ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {

        auto doubleElapsed = d(elapsed);

        auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);

        auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);


        auto doubleMinEpochIters = d(mBench.minEpochIterations());

        if (doubleNewIters < doubleMinEpochIters) {

            doubleNewIters = doubleMinEpochIters;

        }

        doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();


        // +0.5 for correct rounding when casting

        // NOLINTNEXTLINE(bugprone-incorrect-roundings)

        return static_cast<uint64_t>(doubleNewIters + 0.5);

    }


    ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") void upscale(std::chrono::nanoseconds elapsed) {

        if (elapsed * 10 < mTargetRuntimePerEpoch) {

            // we are far below the target runtime. Multiply iterations by 10 (with overflow check)

            if (mNumIters * 10 < mNumIters) {

                // overflow :-(

                showResult("iterations overflow. Maybe your code got optimized away?");

                mNumIters = 0;

                return;

            }

            mNumIters *= 10;

        } else {

            mNumIters = calcBestNumIters(elapsed, mNumIters);

        }

    }


    void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {

#    if defined(ANKERL_NANOBENCH_LOG_ENABLED)

        auto oldIters = mNumIters;

#    endif


        switch (mState) {

        case State::warmup:

            if (isCloseEnoughForMeasurements(elapsed)) {

                // if elapsed is close enough, we can skip upscaling and go right to measurements

                // still, we don't add the result to the measurements.

                mState = State::measuring;

                mNumIters = calcBestNumIters(elapsed, mNumIters);

            } else {

                // not close enough: switch to upscaling

                mState = State::upscaling_runtime;

                upscale(elapsed);

            }

            break;


        case State::upscaling_runtime:

            if (isCloseEnoughForMeasurements(elapsed)) {

                // if we are close enough, add measurement and switch to always measuring

                mState = State::measuring;

                mTotalElapsed += elapsed;

                mTotalNumIters += mNumIters;

                mResult.add(elapsed, mNumIters, pc);

                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);

            } else {

                upscale(elapsed);

            }

            break;


        case State::measuring:

            // just add measurements - no questions asked. Even when runtime is low. But we can't ignore

            // that fluctuation, or else we would bias the result

            mTotalElapsed += elapsed;

            mTotalNumIters += mNumIters;

            mResult.add(elapsed, mNumIters, pc);

            if (0 != mBench.epochIterations()) {

                mNumIters = mBench.epochIterations();

            } else {

                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);

            }

            break;


        case State::endless:

            mNumIters = (std::numeric_limits<uint64_t>::max)();

            break;

        }


        if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {

            // we got all the results that we need, finish it

            showResult("");

            mNumIters = 0;

        }


        ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, d(elapsed.count())) << " elapsed, "

                                           << detail::fmt::Number(20, 3, d(mTargetRuntimePerEpoch.count())) << " target. oldIters="

                                           << oldIters << ", mNumIters=" << mNumIters << ", mState=" << static_cast<int>(mState));

    }


    // NOLINTNEXTLINE(readability-function-cognitive-complexity)

    void showResult(std::string const& errorMessage) const {

        ANKERL_NANOBENCH_LOG(errorMessage);


        if (mBench.output() != nullptr) {

            // prepare column data ///////

            std::vector<fmt::MarkDownColumn> columns;


            auto rMedian = mResult.median(Result::Measure::elapsed);


            if (mBench.relative()) {

                double d = 100.0;

                if (!mBench.results().empty()) {

                    d = rMedian <= 0.0 ? 0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0;

                }

                columns.emplace_back(11, 1, "relative", "%", d);

            }


            if (mBench.complexityN() > 0) {

                columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN());

            }


            columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "",

                                 rMedian / (mBench.timeUnit().count() * mBench.batch()));

            columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);


            double const rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed);

            columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0);


            double rInsMedian = -1.0;

            if (mBench.performanceCounters() && mResult.has(Result::Measure::instructions)) {

                rInsMedian = mResult.median(Result::Measure::instructions);

                columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch());

            }


            double rCycMedian = -1.0;

            if (mBench.performanceCounters() && mResult.has(Result::Measure::cpucycles)) {

                rCycMedian = mResult.median(Result::Measure::cpucycles);

                columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch());

            }

            if (rInsMedian > 0.0 && rCycMedian > 0.0) {

                columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);

            }

            if (mBench.performanceCounters() && mResult.has(Result::Measure::branchinstructions)) {

                double const rBraMedian = mResult.median(Result::Measure::branchinstructions);

                columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch());

                if (mResult.has(Result::Measure::branchmisses)) {

                    double p = 0.0;

                    if (rBraMedian >= 1e-9) {

                        p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian;

                    }

                    columns.emplace_back(10, 1, "miss%", "%", p);

                }

            }


            columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed));


            // write everything

            auto& os = *mBench.output();


            // combine all elements that are relevant for printing the header

            uint64_t hash = 0;

            hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);

            hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);

            hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);

            hash = hash_combine(std::hash<double>{}(mBench.timeUnit().count()), hash);

            hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);

            hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);


            if (hash != singletonHeaderHash()) {

                singletonHeaderHash() = hash;


                // no result yet, print header

                os << std::endl;

                for (auto const& col : columns) {

                    os << col.title();

                }

                os << "| " << mBench.title() << std::endl;


                for (auto const& col : columns) {

                    os << col.separator();

                }

                os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl;

            }


            if (!errorMessage.empty()) {

                for (auto const& col : columns) {

                    os << col.invalid();

                }

                os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl;

            } else {

                for (auto const& col : columns) {

                    os << col.value();

                }

                os << "| ";

                auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05;

                if (showUnstable) {

                    os << ":wavy_dash: ";

                }

                os << fmt::MarkDownCode(mBench.name());

                if (showUnstable) {

                    auto avgIters = d(mTotalNumIters) / d(mBench.epochs());

                    // NOLINTNEXTLINE(bugprone-incorrect-roundings)

                    auto suggestedIters = static_cast<uint64_t>(avgIters * 10 + 0.5);


                    os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)

                       << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")";

                }

                os << std::endl;

            }

        }

    }


    ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept {

        return elapsed * 3 >= mTargetRuntimePerEpoch * 2;

    }


    uint64_t mNumIters = 1;                            // NOLINT(misc-non-private-member-variables-in-classes)

    Bench const& mBench;                               // NOLINT(misc-non-private-member-variables-in-classes)

    std::chrono::nanoseconds mTargetRuntimePerEpoch{}; // NOLINT(misc-non-private-member-variables-in-classes)

    Result mResult;                                    // NOLINT(misc-non-private-member-variables-in-classes)

    Rng mRng{123};                                     // NOLINT(misc-non-private-member-variables-in-classes)

    std::chrono::nanoseconds mTotalElapsed{};          // NOLINT(misc-non-private-member-variables-in-classes)

    uint64_t mTotalNumIters = 0;                       // NOLINT(misc-non-private-member-variables-in-classes)

    State mState = State::upscaling_runtime;           // NOLINT(misc-non-private-member-variables-in-classes)

};

ANKERL_NANOBENCH(IGNORE_PADDED_POP)


IterationLogic::IterationLogic(Bench const& bench)

    : mPimpl(new Impl(bench)) {}


IterationLogic::~IterationLogic() {

    delete mPimpl;

}


uint64_t IterationLogic::numIters() const noexcept {

    ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters);

    return mPimpl->mNumIters;

}


void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {

    mPimpl->add(elapsed, pc);

}


void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {

    results.emplace_back(std::move(mPimpl->mResult));

}


#    if ANKERL_NANOBENCH(PERF_COUNTERS)


ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)

class LinuxPerformanceCounters {

public:

    struct Target {

        Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_)

            : targetValue(targetValue_)

            , correctMeasuringOverhead(correctMeasuringOverhead_)

            , correctLoopOverhead(correctLoopOverhead_) {}


        uint64_t* targetValue{};         // NOLINT(misc-non-private-member-variables-in-classes)

        bool correctMeasuringOverhead{}; // NOLINT(misc-non-private-member-variables-in-classes)

        bool correctLoopOverhead{};      // NOLINT(misc-non-private-member-variables-in-classes)

    };


    LinuxPerformanceCounters() = default;

    LinuxPerformanceCounters(LinuxPerformanceCounters const&) = delete;

    LinuxPerformanceCounters(LinuxPerformanceCounters&&) = delete;

    LinuxPerformanceCounters& operator=(LinuxPerformanceCounters const&) = delete;

    LinuxPerformanceCounters& operator=(LinuxPerformanceCounters&&) = delete;

    ~LinuxPerformanceCounters();


    // quick operation

    inline void start() {}


    inline void stop() {}


    bool monitor(perf_sw_ids swId, Target target);

    bool monitor(perf_hw_id hwId, Target target);


    ANKERL_NANOBENCH(NODISCARD) bool hasError() const noexcept {

        return mHasError;

    }


    // Just reading data is faster than enable & disabling.

    // we subtract data ourselves.

    inline void beginMeasure() {

        if (mHasError) {

            return;

        }


        // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)

        mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);

        if (mHasError) {

            return;

        }


        // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)

        mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);

    }


    inline void endMeasure() {

        if (mHasError) {

            return;

        }


        // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)

        mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));

        if (mHasError) {

            return;

        }


        auto const numBytes = sizeof(uint64_t) * mCounters.size();

        auto ret = read(mFd, mCounters.data(), numBytes);

        mHasError = ret != static_cast<ssize_t>(numBytes);

    }


    void updateResults(uint64_t numIters);


    // rounded integer division

    template <typename T>

    static inline T divRounded(T a, T divisor) {

        return (a + divisor / 2) / divisor;

    }


    ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")

    static inline uint32_t mix(uint32_t x) noexcept {

        x ^= x << 13U;

        x ^= x >> 17U;

        x ^= x << 5U;

        return x;

    }


    template <typename Op>

    ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")

    void calibrate(Op&& op) {

        // clear current calibration data,

        for (auto& v : mCalibratedOverhead) {

            v = UINT64_C(0);

        }


        // create new calibration data

        auto newCalibration = mCalibratedOverhead;

        for (auto& v : newCalibration) {

            v = (std::numeric_limits<uint64_t>::max)();

        }

        for (size_t iter = 0; iter < 100; ++iter) {

            beginMeasure();

            op();

            endMeasure();

            if (mHasError) {

                return;

            }


            for (size_t i = 0; i < newCalibration.size(); ++i) {

                auto diff = mCounters[i];

                if (newCalibration[i] > diff) {

                    newCalibration[i] = diff;

                }

            }

        }


        mCalibratedOverhead = std::move(newCalibration);


        {

            // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles.

            // marsaglia's xorshift: mov, sal/shr, xor. Times 3.

            // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further.

            // see https://godbolt.org/z/49RVQ5

            uint64_t const numIters = 100000U + (std::random_device{}() & 3U);

            uint64_t n = numIters;

            uint32_t x = 1234567;


            beginMeasure();

            while (n-- > 0) {

                x = mix(x);

            }

            endMeasure();

            detail::doNotOptimizeAway(x);

            auto measure1 = mCounters;


            n = numIters;

            beginMeasure();

            while (n-- > 0) {

                // we now run *twice* so we can easily calculate the overhead

                x = mix(x);

                x = mix(x);

            }

            endMeasure();

            detail::doNotOptimizeAway(x);

            auto measure2 = mCounters;


            for (size_t i = 0; i < mCounters.size(); ++i) {

                // factor 2 because we have two instructions per loop

                auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;

                auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;

                auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;


                mLoopOverhead[i] = divRounded(overhead, numIters);

            }

        }

    }


private:

    bool monitor(uint32_t type, uint64_t eventid, Target target);


    std::map<uint64_t, Target> mIdToTarget{};


    // start with minimum size of 3 for read_format

    std::vector<uint64_t> mCounters{3};

    std::vector<uint64_t> mCalibratedOverhead{3};

    std::vector<uint64_t> mLoopOverhead{3};


    uint64_t mTimeEnabledNanos = 0;

    uint64_t mTimeRunningNanos = 0;

    int mFd = -1;

    bool mHasError = false;

};

ANKERL_NANOBENCH(IGNORE_PADDED_POP)


LinuxPerformanceCounters::~LinuxPerformanceCounters() {

    if (-1 != mFd) {

        close(mFd);

    }

}


bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {

    return monitor(PERF_TYPE_SOFTWARE, swId, target);

}


bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {

    return monitor(PERF_TYPE_HARDWARE, hwId, target);

}


// overflow is ok, it's checked

ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")

void LinuxPerformanceCounters::updateResults(uint64_t numIters) {

    // clear old data

    for (auto& id_value : mIdToTarget) {

        *id_value.second.targetValue = UINT64_C(0);

    }


    if (mHasError) {

        return;

    }


    mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];

    mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];


    for (uint64_t i = 0; i < mCounters[0]; ++i) {

        auto idx = static_cast<size_t>(3 + i * 2 + 0);

        auto id = mCounters[idx + 1U];


        auto it = mIdToTarget.find(id);

        if (it != mIdToTarget.end()) {


            auto& tgt = it->second;

            *tgt.targetValue = mCounters[idx];

            if (tgt.correctMeasuringOverhead) {

                if (*tgt.targetValue >= mCalibratedOverhead[idx]) {

                    *tgt.targetValue -= mCalibratedOverhead[idx];

                } else {

                    *tgt.targetValue = 0U;

                }

            }

            if (tgt.correctLoopOverhead) {

                auto correctionVal = mLoopOverhead[idx] * numIters;

                if (*tgt.targetValue >= correctionVal) {

                    *tgt.targetValue -= correctionVal;

                } else {

                    *tgt.targetValue = 0U;

                }

            }

        }

    }

}


bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {

    *target.targetValue = (std::numeric_limits<uint64_t>::max)();

    if (mHasError) {

        return false;

    }


    auto pea = perf_event_attr();

    std::memset(&pea, 0, sizeof(perf_event_attr));

    pea.type = type;

    pea.size = sizeof(perf_event_attr);

    pea.config = eventid;

    pea.disabled = 1; // start counter as disabled

    pea.exclude_kernel = 1;

    pea.exclude_hv = 1;


    // NOLINTNEXTLINE(hicpp-signed-bitwise)

    pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;


    const int pid = 0;                    // the current process

    const int cpu = -1;                   // all CPUs

#        if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14

    const unsigned long flags = PERF_FLAG_FD_CLOEXEC;

#        else

    const unsigned long flags = 0;

#        endif


    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)

    auto fd = static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));

    if (-1 == fd) {

        return false;

    }

    if (-1 == mFd) {

        // first call: set to fd, and use this from now on

        mFd = fd;

    }

    uint64_t id = 0;

    // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)

    if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) {

        // couldn't get id

        return false;

    }


    // insert into map, rely on the fact that map's references are constant.

    mIdToTarget.emplace(id, target);


    // prepare readformat with the correct size (after the insert)

    auto size = 3 + 2 * mIdToTarget.size();

    mCounters.resize(size);

    mCalibratedOverhead.resize(size);

    mLoopOverhead.resize(size);


    return true;

}


PerformanceCounters::PerformanceCounters()

    : mPc(new LinuxPerformanceCounters())

    , mVal()

    , mHas() {


    // HW events

    mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));

    if (!mHas.cpuCycles) {

        // Fallback to cycles counter, reference cycles not available in many systems.

        mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));

    }

    mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true));

    mHas.branchInstructions =

        mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false));

    mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false));

    // mHas.branchMisses = false;


    // SW events

    mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false));

    mHas.contextSwitches =

        mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false));


    mPc->start();

    mPc->calibrate([] {

        auto before = ankerl::nanobench::Clock::now();

        auto after = ankerl::nanobench::Clock::now();

        (void)before;

        (void)after;

    });


    if (mPc->hasError()) {

        // something failed, don't monitor anything.

        mHas = PerfCountSet<bool>{};

    }

}


PerformanceCounters::~PerformanceCounters() {

    // no need to check for nullptr, delete nullptr has no effect

    delete mPc;

}


void PerformanceCounters::beginMeasure() {

    mPc->beginMeasure();

}


void PerformanceCounters::endMeasure() {

    mPc->endMeasure();

}


void PerformanceCounters::updateResults(uint64_t numIters) {

    mPc->updateResults(numIters);

}


#    else


PerformanceCounters::PerformanceCounters() = default;

PerformanceCounters::~PerformanceCounters() = default;

void PerformanceCounters::beginMeasure() {}

void PerformanceCounters::endMeasure() {}

void PerformanceCounters::updateResults(uint64_t) {}


#    endif


ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& PerformanceCounters::val() const noexcept {

    return mVal;

}

ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& PerformanceCounters::has() const noexcept {

    return mHas;

}


// formatting utilities

namespace fmt {


// adds thousands separator to numbers

NumSep::NumSep(char sep)

    : mSep(sep) {}


char NumSep::do_thousands_sep() const {

    return mSep;

}


std::string NumSep::do_grouping() const {

    return "\003";

}


// RAII to save & restore a stream's state

StreamStateRestorer::StreamStateRestorer(std::ostream& s)

    : mStream(s)

    , mLocale(s.getloc())

    , mPrecision(s.precision())

    , mWidth(s.width())

    , mFill(s.fill())

    , mFmtFlags(s.flags()) {}


StreamStateRestorer::~StreamStateRestorer() {

    restore();

}


// sets back all stream info that we remembered at construction

void StreamStateRestorer::restore() {

    mStream.imbue(mLocale);

    mStream.precision(mPrecision);

    mStream.width(mWidth);

    mStream.fill(mFill);

    mStream.flags(mFmtFlags);

}


Number::Number(int width, int precision, int64_t value)

    : mWidth(width)

    , mPrecision(precision)

    , mValue(d(value)) {}


Number::Number(int width, int precision, double value)

    : mWidth(width)

    , mPrecision(precision)

    , mValue(value) {}


std::ostream& Number::write(std::ostream& os) const {

    StreamStateRestorer const restorer(os);

    os.imbue(std::locale(os.getloc(), new NumSep(',')));

    os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;

    return os;

}


std::string Number::to_s() const {

    std::stringstream ss;

    write(ss);

    return ss.str();

}


std::string to_s(uint64_t n) {

    std::string str;

    do {

        str += static_cast<char>('0' + static_cast<char>(n % 10));

        n /= 10;

    } while (n != 0);

    std::reverse(str.begin(), str.end());

    return str;

}


std::ostream& operator<<(std::ostream& os, Number const& n) {

    return n.write(os);

}


MarkDownColumn::MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val) noexcept

    : mWidth(w)

    , mPrecision(prec)

    , mTitle(std::move(tit))

    , mSuffix(std::move(suff))

    , mValue(val) {}


std::string MarkDownColumn::title() const {

    std::stringstream ss;

    ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' ';

    return ss.str();

}


std::string MarkDownColumn::separator() const {

    std::string sep(static_cast<size_t>(mWidth), '-');

    sep.front() = '|';

    sep.back() = ':';

    return sep;

}


std::string MarkDownColumn::invalid() const {

    std::string sep(static_cast<size_t>(mWidth), ' ');

    sep.front() = '|';

    sep[sep.size() - 2] = '-';

    return sep;

}


std::string MarkDownColumn::value() const {

    std::stringstream ss;

    auto width = mWidth - 2 - static_cast<int>(mSuffix.size());

    ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' ';

    return ss.str();

}


// Formats any text as markdown code, escaping backticks.

MarkDownCode::MarkDownCode(std::string const& what) {

    mWhat.reserve(what.size() + 2);

    mWhat.push_back('`');

    for (char const c : what) {

        mWhat.push_back(c);

        if ('`' == c) {

            mWhat.push_back('`');

        }

    }

    mWhat.push_back('`');

}


std::ostream& MarkDownCode::write(std::ostream& os) const {

    return os << mWhat;

}


std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) {

    return mdCode.write(os);

}

} // namespace fmt

} // namespace detail


// provide implementation here so it's only generated once

Config::Config() = default;

Config::~Config() = default;

Config& Config::operator=(Config const&) = default;

Config& Config::operator=(Config&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;

Config::Config(Config const&) = default;

Config::Config(Config&&) noexcept = default;


// provide implementation here so it's only generated once

Result::~Result() = default;

Result& Result::operator=(Result const&) = default;

Result& Result::operator=(Result&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;

Result::Result(Result const&) = default;

Result::Result(Result&&) noexcept = default;


namespace detail {

template <typename T>

inline constexpr typename std::underlying_type<T>::type u(T val) noexcept {

    return static_cast<typename std::underlying_type<T>::type>(val);

}

} // namespace detail


// Result returned after a benchmark has finished. Can be used as a baseline for relative().

Result::Result(Config benchmarkConfig)

    : mConfig(std::move(benchmarkConfig))

    , mNameToMeasurements{detail::u(Result::Measure::_size)} {}


void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) {

    using detail::d;

    using detail::u;


    double const dIters = d(iters);

    mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);


    mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);

    if (pc.has().pageFaults) {

        mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);

    }

    if (pc.has().cpuCycles) {

        mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);

    }

    if (pc.has().contextSwitches) {

        mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);

    }

    if (pc.has().instructions) {

        mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);

    }

    if (pc.has().branchInstructions) {

        double branchInstructions = 0.0;

        // correcting branches: remove branch introduced by the while (...) loop for each iteration.

        if (pc.val().branchInstructions > iters + 1U) {

            branchInstructions = d(pc.val().branchInstructions - (iters + 1U));

        }

        mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);


        if (pc.has().branchMisses) {

            // correcting branch misses

            double branchMisses = d(pc.val().branchMisses);

            if (branchMisses > branchInstructions) {

                // can't have branch misses when there were branches...

                branchMisses = branchInstructions;

            }


            // assuming at least one missed branch for the loop

            branchMisses -= 1.0;

            if (branchMisses < 1.0) {

                branchMisses = 1.0;

            }

            mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);

        }

    }

}


Config const& Result::config() const noexcept {

    return mConfig;

}


inline double calcMedian(std::vector<double>& data) {

    if (data.empty()) {

        return 0.0;

    }

    std::sort(data.begin(), data.end());


    auto midIdx = data.size() / 2U;

    if (1U == (data.size() & 1U)) {

        return data[midIdx];

    }

    return (data[midIdx - 1U] + data[midIdx]) / 2U;

}


double Result::median(Measure m) const {

    // create a copy so we can sort

    auto data = mNameToMeasurements[detail::u(m)];

    return calcMedian(data);

}


double Result::average(Measure m) const {

    using detail::d;

    auto const& data = mNameToMeasurements[detail::u(m)];

    if (data.empty()) {

        return 0.0;

    }


    // create a copy so we can sort

    return sum(m) / d(data.size());

}


double Result::medianAbsolutePercentError(Measure m) const {

    // create copy

    auto data = mNameToMeasurements[detail::u(m)];


    // calculates MdAPE which is the median of percentage error

    // see https://support.numxl.com/hc/en-us/articles/115001223503-MdAPE-Median-Absolute-Percentage-Error

    auto med = calcMedian(data);


    // transform the data to absolute error

    for (auto& x : data) {

        x = (x - med) / x;

        if (x < 0) {

            x = -x;

        }

    }

    return calcMedian(data);

}


double Result::sum(Measure m) const noexcept {

    auto const& data = mNameToMeasurements[detail::u(m)];

    return std::accumulate(data.begin(), data.end(), 0.0);

}


double Result::sumProduct(Measure m1, Measure m2) const noexcept {

    auto const& data1 = mNameToMeasurements[detail::u(m1)];

    auto const& data2 = mNameToMeasurements[detail::u(m2)];


    if (data1.size() != data2.size()) {

        return 0.0;

    }


    double result = 0.0;

    for (size_t i = 0, s = data1.size(); i != s; ++i) {

        result += data1[i] * data2[i];

    }

    return result;

}


bool Result::has(Measure m) const noexcept {

    return !mNameToMeasurements[detail::u(m)].empty();

}


double Result::get(size_t idx, Measure m) const {

    auto const& data = mNameToMeasurements[detail::u(m)];

    return data.at(idx);

}


bool Result::empty() const noexcept {

    return 0U == size();

}


size_t Result::size() const noexcept {

    auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];

    return data.size();

}


double Result::minimum(Measure m) const noexcept {

    auto const& data = mNameToMeasurements[detail::u(m)];

    if (data.empty()) {

        return 0.0;

    }


    // here its save to assume that at least one element is there

    return *std::min_element(data.begin(), data.end());

}


double Result::maximum(Measure m) const noexcept {

    auto const& data = mNameToMeasurements[detail::u(m)];

    if (data.empty()) {

        return 0.0;

    }


    // here its save to assume that at least one element is there

    return *std::max_element(data.begin(), data.end());

}


std::string const& Result::context(char const* variableName) const {

    return mConfig.mContext.at(variableName);

}


std::string const& Result::context(std::string const& variableName) const {

    return mConfig.mContext.at(variableName);

}


Result::Measure Result::fromString(std::string const& str) {

    if (str == "elapsed") {

        return Measure::elapsed;

    }

    if (str == "iterations") {

        return Measure::iterations;

    }

    if (str == "pagefaults") {

        return Measure::pagefaults;

    }

    if (str == "cpucycles") {

        return Measure::cpucycles;

    }

    if (str == "contextswitches") {

        return Measure::contextswitches;

    }

    if (str == "instructions") {

        return Measure::instructions;

    }

    if (str == "branchinstructions") {

        return Measure::branchinstructions;

    }

    if (str == "branchmisses") {

        return Measure::branchmisses;

    }

    // not found, return _size

    return Measure::_size;

}


// Configuration of a microbenchmark.

Bench::Bench() {

    mConfig.mOut = &std::cout;

}


Bench::Bench(Bench&&) noexcept = default;

Bench& Bench::operator=(Bench&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;

Bench::Bench(Bench const&) = default;

Bench& Bench::operator=(Bench const&) = default;

Bench::~Bench() noexcept = default;


double Bench::batch() const noexcept {

    return mConfig.mBatch;

}


double Bench::complexityN() const noexcept {

    return mConfig.mComplexityN;

}


// Set a baseline to compare it to. 100% it is exactly as fast as the baseline, >100% means it is faster than the baseline, <100%

// means it is slower than the baseline.

Bench& Bench::relative(bool isRelativeEnabled) noexcept {

    mConfig.mIsRelative = isRelativeEnabled;

    return *this;

}

bool Bench::relative() const noexcept {

    return mConfig.mIsRelative;

}


Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {

    mConfig.mShowPerformanceCounters = showPerformanceCounters;

    return *this;

}

bool Bench::performanceCounters() const noexcept {

    return mConfig.mShowPerformanceCounters;

}


// Operation unit. Defaults to "op", could be e.g. "byte" for string processing.

// If u differs from currently set unit, the stored results will be cleared.

// Use singular (byte, not bytes).

Bench& Bench::unit(char const* u) {

    if (u != mConfig.mUnit) {

        mResults.clear();

    }

    mConfig.mUnit = u;

    return *this;

}


Bench& Bench::unit(std::string const& u) {

    return unit(u.c_str());

}


std::string const& Bench::unit() const noexcept {

    return mConfig.mUnit;

}


Bench& Bench::timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName) {

    mConfig.mTimeUnit = tu;

    mConfig.mTimeUnitName = tuName;

    return *this;

}


std::string const& Bench::timeUnitName() const noexcept {

    return mConfig.mTimeUnitName;

}


std::chrono::duration<double> const& Bench::timeUnit() const noexcept {

    return mConfig.mTimeUnit;

}


// If benchmarkTitle differs from currently set title, the stored results will be cleared.

Bench& Bench::title(const char* benchmarkTitle) {

    if (benchmarkTitle != mConfig.mBenchmarkTitle) {

        mResults.clear();

    }

    mConfig.mBenchmarkTitle = benchmarkTitle;

    return *this;

}

Bench& Bench::title(std::string const& benchmarkTitle) {

    if (benchmarkTitle != mConfig.mBenchmarkTitle) {

        mResults.clear();

    }

    mConfig.mBenchmarkTitle = benchmarkTitle;

    return *this;

}


std::string const& Bench::title() const noexcept {

    return mConfig.mBenchmarkTitle;

}


Bench& Bench::name(const char* benchmarkName) {

    mConfig.mBenchmarkName = benchmarkName;

    return *this;

}


Bench& Bench::name(std::string const& benchmarkName) {

    mConfig.mBenchmarkName = benchmarkName;

    return *this;

}


std::string const& Bench::name() const noexcept {

    return mConfig.mBenchmarkName;

}


Bench& Bench::context(char const* variableName, char const* variableValue) {

    mConfig.mContext[variableName] = variableValue;

    return *this;

}


Bench& Bench::context(std::string const& variableName, std::string const& variableValue) {

    mConfig.mContext[variableName] = variableValue;

    return *this;

}


Bench& Bench::clearContext() {

    mConfig.mContext.clear();

    return *this;

}


// Number of epochs to evaluate. The reported result will be the median of evaluation of each epoch.

Bench& Bench::epochs(size_t numEpochs) noexcept {

    mConfig.mNumEpochs = numEpochs;

    return *this;

}

size_t Bench::epochs() const noexcept {

    return mConfig.mNumEpochs;

}


// Desired evaluation time is a multiple of clock resolution. Default is to be 1000 times above this measurement precision.

Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept {

    mConfig.mClockResolutionMultiple = multiple;

    return *this;

}

size_t Bench::clockResolutionMultiple() const noexcept {

    return mConfig.mClockResolutionMultiple;

}


// Sets the maximum time each epoch should take. Default is 100ms.

Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {

    mConfig.mMaxEpochTime = t;

    return *this;

}

std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {

    return mConfig.mMaxEpochTime;

}


// Sets the maximum time each epoch should take. Default is 100ms.

Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {

    mConfig.mMinEpochTime = t;

    return *this;

}

std::chrono::nanoseconds Bench::minEpochTime() const noexcept {

    return mConfig.mMinEpochTime;

}


Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {

    mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;

    return *this;

}

uint64_t Bench::minEpochIterations() const noexcept {

    return mConfig.mMinEpochIterations;

}


Bench& Bench::epochIterations(uint64_t numIters) noexcept {

    mConfig.mEpochIterations = numIters;

    return *this;

}

uint64_t Bench::epochIterations() const noexcept {

    return mConfig.mEpochIterations;

}


Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {

    mConfig.mWarmup = numWarmupIters;

    return *this;

}

uint64_t Bench::warmup() const noexcept {

    return mConfig.mWarmup;

}


Bench& Bench::config(Config const& benchmarkConfig) {

    mConfig = benchmarkConfig;

    return *this;

}

Config const& Bench::config() const noexcept {

    return mConfig;

}


Bench& Bench::output(std::ostream* outstream) noexcept {

    mConfig.mOut = outstream;

    return *this;

}


ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept {

    return mConfig.mOut;

}


std::vector<Result> const& Bench::results() const noexcept {

    return mResults;

}


Bench& Bench::render(char const* templateContent, std::ostream& os) {

    ::ankerl::nanobench::render(templateContent, *this, os);

    return *this;

}


Bench& Bench::render(std::string const& templateContent, std::ostream& os) {

    ::ankerl::nanobench::render(templateContent, *this, os);

    return *this;

}


std::vector<BigO> Bench::complexityBigO() const {

    std::vector<BigO> bigOs;

    auto rangeMeasure = BigO::collectRangeMeasure(mResults);

    bigOs.emplace_back("O(1)", rangeMeasure, [](double) {

        return 1.0;

    });

    bigOs.emplace_back("O(n)", rangeMeasure, [](double n) {

        return n;

    });

    bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) {

        return std::log2(n);

    });

    bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) {

        return n * std::log2(n);

    });

    bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) {

        return n * n;

    });

    bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) {

        return n * n * n;

    });

    std::sort(bigOs.begin(), bigOs.end());

    return bigOs;

}


Rng::Rng()

    : mX(0)

    , mY(0) {

    std::random_device rd;

    std::uniform_int_distribution<uint64_t> dist;

    do {

        mX = dist(rd);

        mY = dist(rd);

    } while (mX == 0 && mY == 0);

}


ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")

uint64_t splitMix64(uint64_t& state) noexcept {

    uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15));

    z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);

    z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb);

    return z ^ (z >> 31U);

}


// Seeded as described in romu paper (update april 2020)

Rng::Rng(uint64_t seed) noexcept

    : mX(splitMix64(seed))

    , mY(splitMix64(seed)) {

    for (size_t i = 0; i < 10; ++i) {

        operator()();

    }

}


// only internally used to copy the RNG.

Rng::Rng(uint64_t x, uint64_t y) noexcept

    : mX(x)

    , mY(y) {}


Rng Rng::copy() const noexcept {

    return Rng{mX, mY};

}


Rng::Rng(std::vector<uint64_t> const& data)

    : mX(0)

    , mY(0) {

    if (data.size() != 2) {

        throw std::runtime_error("ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " +

                                 detail::fmt::to_s(data.size()));

    }

    mX = data[0];

    mY = data[1];

}


std::vector<uint64_t> Rng::state() const {

    std::vector<uint64_t> data(2);

    data[0] = mX;

    data[1] = mY;

    return data;

}


BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result> const& results) {

    BigO::RangeMeasure rangeMeasure;

    for (auto const& result : results) {

        if (result.config().mComplexityN > 0.0) {

            rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));

        }

    }

    return rangeMeasure;

}


BigO::BigO(std::string bigOName, RangeMeasure const& rangeMeasure)

    : mName(std::move(bigOName)) {


    // estimate the constant factor

    double sumRangeMeasure = 0.0;

    double sumRangeRange = 0.0;


    for (const auto& rm : rangeMeasure) {

        sumRangeMeasure += rm.first * rm.second;

        sumRangeRange += rm.first * rm.first;

    }

    mConstant = sumRangeMeasure / sumRangeRange;


    // calculate root mean square

    double err = 0.0;

    double sumMeasure = 0.0;

    for (const auto& rm : rangeMeasure) {

        auto diff = mConstant * rm.first - rm.second;

        err += diff * diff;


        sumMeasure += rm.second;

    }


    auto n = detail::d(rangeMeasure.size());

    auto mean = sumMeasure / n;

    mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;

}


BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure)

    : BigO(std::string(bigOName), rangeMeasure) {}


std::string const& BigO::name() const noexcept {

    return mName;

}


double BigO::constant() const noexcept {

    return mConstant;

}


double BigO::normalizedRootMeanSquare() const noexcept {

    return mNormalizedRootMeanSquare;

}


bool BigO::operator<(BigO const& other) const noexcept {

    return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);

}


std::ostream& operator<<(std::ostream& os, BigO const& bigO) {

    return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare();

}


std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs) {

    detail::fmt::StreamStateRestorer const restorer(os);

    os << std::endl << "|   coefficient |   err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl;

    for (auto const& bigO : bigOs) {

        os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " ";

        os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% ";

        os << "| " << bigO.name();

        os << std::endl;

    }

    return os;

}


} // namespace nanobench

} // namespace ankerl


#endif // ANKERL_NANOBENCH_IMPLEMENT

#endif // ANKERL_NANOBENCH_H_INCLUDED

ret
int ret
Definition: bitcoin-cli.cpp:1340

flags
int flags
Definition: bitcoin-tx.cpp:529

ankerl::nanobench::Bench
Main entry point to nanobench's benchmarking facility.
Definition: nanobench.h:627

ankerl::nanobench::Bench::~Bench
~Bench() noexcept

ankerl::nanobench::Bench::operator=
Bench & operator=(Bench const &other)

ankerl::nanobench::Bench::doNotOptimizeAway
ANKERL_NANOBENCH(NODISCARD) std Bench & doNotOptimizeAway(Arg &&arg)
Retrieves all benchmark results collected by the bench object so far.

ankerl::nanobench::Bench::run
Bench & run(char const *benchmarkName, Op &&op)
Repeatedly calls op() based on the configuration, and performs measurements.
Definition: nanobench.h:1234

ankerl::nanobench::Bench::batch
Bench & batch(T b) noexcept
Sets the batch size.
Definition: nanobench.h:1258

ankerl::nanobench::Bench::complexityBigO
std::vector< BigO > complexityBigO() const

ankerl::nanobench::Bench::Bench
Bench()
Creates a new benchmark for configuration and running of benchmarks.

ankerl::nanobench::Bench::operator=
Bench & operator=(Bench &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))

ankerl::nanobench::Bench::Bench
Bench(Bench &&other) noexcept

ankerl::nanobench::Bench::Bench
Bench(Bench const &other)

ankerl::nanobench::Bench::complexityN
Bench & complexityN(T n) noexcept
Definition: nanobench.h:1265

ankerl::nanobench::BigO
Definition: nanobench.h:1111

ankerl::nanobench::BigO::mapRangeMeasure
static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op)
Definition: nanobench.h:1116

ankerl::nanobench::BigO::BigO
BigO(std::string bigOName, RangeMeasure const &scaledRangeMeasure)

ankerl::nanobench::BigO::RangeMeasure
std::vector< std::pair< double, double > > RangeMeasure
Definition: nanobench.h:1113

ankerl::nanobench::BigO::BigO
BigO(char const *bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
Definition: nanobench.h:1126

ankerl::nanobench::BigO::collectRangeMeasure
static RangeMeasure collectRangeMeasure(std::vector< Result > const &results)

ankerl::nanobench::BigO::BigO
BigO(std::string bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
Definition: nanobench.h:1130

ankerl::nanobench::BigO::BigO
BigO(char const *bigOName, RangeMeasure const &scaledRangeMeasure)

ankerl::nanobench::Result
Definition: nanobench.h:419

ankerl::nanobench::Result::Result
Result(Config benchmarkConfig)

ankerl::nanobench::Result::fromString
static Measure fromString(std::string const &str)

ankerl::nanobench::Result::add
void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const &pc)

ankerl::nanobench::Result::Result
Result(Result &&other) noexcept

ankerl::nanobench::Result::~Result
~Result()

ankerl::nanobench::Result::Measure
Measure
Definition: nanobench.h:421

ankerl::nanobench::Result::Measure::branchinstructions
@ branchinstructions

ankerl::nanobench::Result::Measure::iterations
@ iterations

ankerl::nanobench::Result::Measure::cpucycles
@ cpucycles

ankerl::nanobench::Result::Measure::_size
@ _size

ankerl::nanobench::Result::Measure::elapsed
@ elapsed

ankerl::nanobench::Result::Measure::branchmisses
@ branchmisses

ankerl::nanobench::Result::Measure::instructions
@ instructions

ankerl::nanobench::Result::ANKERL_NANOBENCH
ANKERL_NANOBENCH(NODISCARD) Config const &config() const noexcept

ankerl::nanobench::Result::operator=
Result & operator=(Result const &other)

ankerl::nanobench::Result::Result
Result(Result const &other)

ankerl::nanobench::Result::operator=
Result & operator=(Result &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))

ankerl::nanobench::Rng
An extremely fast random generator.
Definition: nanobench.h:488

ankerl::nanobench::Rng::min
static constexpr uint64_t() min()

ankerl::nanobench::Rng::Rng
Rng(Rng const &)=delete
As a safety precaution, we don't allow copying.

ankerl::nanobench::Rng::shuffle
void shuffle(Container &container) noexcept
Shuffles all entries in the given container.
Definition: nanobench.h:1191

ankerl::nanobench::Rng::Rng
Rng(Rng &&) noexcept=default

ankerl::nanobench::Rng::operator=
Rng & operator=(Rng const &)=delete
Same as Rng(Rng const&), we don't allow assignment.

ankerl::nanobench::Rng::max
static constexpr uint64_t() max()

ankerl::nanobench::Rng::uniform01
double uniform01() noexcept
Provides a random uniform double value between 0 and 1.
Definition: nanobench.h:1181

ankerl::nanobench::Rng::result_type
uint64_t result_type
This RNG provides 64bit randomness.
Definition: nanobench.h:493

ankerl::nanobench::detail::IterationLogic
Definition: nanobench.h:1060

ankerl::nanobench::detail::IterationLogic::moveResultTo
void moveResultTo(std::vector< Result > &results) noexcept

ankerl::nanobench::detail::IterationLogic::add
void add(std::chrono::nanoseconds elapsed, PerformanceCounters const &pc) noexcept

ankerl::nanobench::detail::IterationLogic::IterationLogic
IterationLogic(IterationLogic &&)=delete

ankerl::nanobench::detail::IterationLogic::~IterationLogic
~IterationLogic()

ankerl::nanobench::detail::IterationLogic::operator=
IterationLogic & operator=(IterationLogic const &)=delete

ankerl::nanobench::detail::IterationLogic::ANKERL_NANOBENCH
ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept

ankerl::nanobench::detail::IterationLogic::IterationLogic
IterationLogic(IterationLogic const &)=delete

ankerl::nanobench::detail::IterationLogic::IterationLogic
IterationLogic(Bench const &bench)

ankerl::nanobench::detail::IterationLogic::operator=
IterationLogic & operator=(IterationLogic &&)=delete

ankerl::nanobench::detail::PerformanceCounters
Definition: nanobench.h:1080

ankerl::nanobench::detail::PerformanceCounters::PerformanceCounters
PerformanceCounters(PerformanceCounters const &)=delete

ankerl::nanobench::detail::PerformanceCounters::beginMeasure
void beginMeasure()

ankerl::nanobench::detail::PerformanceCounters::operator=
PerformanceCounters & operator=(PerformanceCounters const &)=delete

ankerl::nanobench::detail::PerformanceCounters::~PerformanceCounters
~PerformanceCounters()

ankerl::nanobench::detail::PerformanceCounters::updateResults
void updateResults(uint64_t numIters)

ankerl::nanobench::detail::PerformanceCounters::operator=
PerformanceCounters & operator=(PerformanceCounters &&)=delete

ankerl::nanobench::detail::PerformanceCounters::ANKERL_NANOBENCH
ANKERL_NANOBENCH(NODISCARD) PerfCountSet< uint64_t > const &val() const noexcept

ankerl::nanobench::detail::PerformanceCounters::PerformanceCounters
PerformanceCounters(PerformanceCounters &&)=delete

ankerl::nanobench::detail::PerformanceCounters::PerformanceCounters
PerformanceCounters()

ankerl::nanobench::detail::PerformanceCounters::endMeasure
void endMeasure()

sum
volatile double sum
Definition: examples.cpp:10

T
#define T(expected, seed, data)

ankerl::nanobench::detail::doNotOptimizeAway
void doNotOptimizeAway(T &val)
Definition: nanobench.h:1045

ankerl::nanobench::detail::performanceCounters
PerformanceCounters & performanceCounters()

ankerl::nanobench::detail::doNotOptimizeAway
void doNotOptimizeAway(T const &val)
Definition: nanobench.h:1039

ankerl::nanobench::templates::json
char const * json() noexcept
Template to generate JSON data.

ankerl::nanobench::templates::csv
char const * csv() noexcept
CSV data for the benchmark results.

ankerl::nanobench::templates::pyperf
char const * pyperf() noexcept
Output in pyperf compatible JSON format, which can be used for more analyzation.

ankerl::nanobench::templates::htmlBoxplot
char const * htmlBoxplot() noexcept
HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an exampl...

ankerl::nanobench::render
void render(char const *mustacheTemplate, Bench const &bench, std::ostream &out)
Renders output from a mustache-like template and benchmark results.

ankerl::nanobench::Clock
std::conditional< std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock, std::chrono::steady_clock >::type Clock
Definition: nanobench.h:133

ankerl::nanobench::render
void render(std::string const &mustacheTemplate, std::vector< Result > const &results, std::ostream &out)

ankerl::nanobench::operator<<
std::ostream & operator<<(std::ostream &os, BigO const &bigO)

ankerl::nanobench::operator<<
std::ostream & operator<<(std::ostream &os, std::vector< ankerl::nanobench::BigO > const &bigOs)

ankerl::nanobench::doNotOptimizeAway
void doNotOptimizeAway(Arg &&arg)
Makes sure none of the given arguments are optimized away by the compiler.
Definition: nanobench.h:1279

ankerl
Definition: nanobench.h:129

detail
Definition: uint256.h:135

std
Definition: setup_common.h:276

test_vectors_musig2_generate.s
tuple s
Definition: test_vectors_musig2_generate.py:72

test_vectors_musig2_generate.data
data
Definition: test_vectors_musig2_generate.py:98

tests_wycheproof_generate_ecdsa.out
string out
Definition: tests_wycheproof_generate_ecdsa.py:24

wallet::feebumper::Result
Result
Definition: feebumper.h:24

ANKERL_NANOBENCH_LOG
#define ANKERL_NANOBENCH_LOG(x)
Definition: nanobench.h:87

ANKERL_NANOBENCH_NO_SANITIZE
#define ANKERL_NANOBENCH_NO_SANITIZE(...)
Definition: nanobench.h:106

ANKERL_NANOBENCH
#define ANKERL_NANOBENCH(x)
Definition: nanobench.h:49

operator==
bool operator==(const CNetAddr &a, const CNetAddr &b)
Definition: netaddress.cpp:607

operator<
bool operator<(const CNetAddr &a, const CNetAddr &b)
Definition: netaddress.cpp:612

name
const char * name
Definition: rest.cpp:49

stop
static RPCHelpMan stop()
Definition: server.cpp:156

ByteUnit::m
@ m

ByteUnit::k
@ k

ByteUnit::t
@ t

ankerl::nanobench::Config
Definition: nanobench.h:386

ankerl::nanobench::Config::operator=
Config & operator=(Config const &other)

ankerl::nanobench::Config::~Config
~Config()

ankerl::nanobench::Config::Config
Config(Config const &other)

ankerl::nanobench::Config::operator=
Config & operator=(Config &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))

ankerl::nanobench::Config::Config
Config(Config &&other) noexcept

ankerl::nanobench::Config::Config
Config()

ankerl::nanobench::detail::PerfCountSet
Definition: nanobench.h:374

ankerl::nanobench::detail::PerfCountSet::instructions
T instructions
Definition: nanobench.h:378

ankerl::nanobench::detail::PerfCountSet::pageFaults
T pageFaults
Definition: nanobench.h:375

ankerl::nanobench::detail::PerfCountSet::branchInstructions
T branchInstructions
Definition: nanobench.h:379

ankerl::nanobench::detail::PerfCountSet::branchMisses
T branchMisses
Definition: nanobench.h:380

ankerl::nanobench::detail::PerfCountSet::cpuCycles
T cpuCycles
Definition: nanobench.h:376

ankerl::nanobench::detail::PerfCountSet::contextSwitches
T contextSwitches
Definition: nanobench.h:377

rotl
static SECP256K1_INLINE uint64_t rotl(const uint64_t x, int k)
Definition: testrand_impl.h:39

count
static int count
Definition: tests_exhaustive.c:34