// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// How many times we repeat each measurement.
// Measurements are randomly shuffled - we're not doing
// all N identical measurements in a row.
const int measurement_repetitions = 3;
// Timings below this value are too short to be accurate,
// we'll repeat measurements with more iterations until
// we get a timing above that threshold.
const float min_accurate_time = 1e-2f;
// Minimum working set size, in bytes. See --min-working-set-size command line parameter.
// The default value 0 means "use a working set large enough to likely outsize caches".
size_t min_working_set_size = 0;
// Highest clock speed measured so far. It is raised whenever measure_clock_speed()
// reports a larger value, and later measurements are compared against it (with some
// tolerance) to detect that the clock speed has changed during the benchmark run.
float max_clock_speed = 0.0f;
// Range of sizes that we will benchmark (in all 3 K,M,N dimensions).
// Both bounds are inclusive and must be powers of two; the compile-time
// checks below enforce the constraints the rest of the file relies on.
const size_t maxsize = 2048;
const size_t minsize = 16;

static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
static_assert(maxsize > minsize, "maxsize must be larger than minsize");
static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");
// Just a helper to store a triple of K,M,N sizes for a matrix product.
//
// Copy construction and assignment are the compiler-generated member-wise
// versions; the struct holds nothing but three plain integers.
struct size_triple_t
{
  size_t k, m, n;

  // All sizes zero.
  size_triple_t() : k(0), m(0), n(0) {}

  // Explicit sizes, in K, M, N order.
  size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}

  // Decode the 12-bit compact form: bits 11..8 hold log2(k), bits 7..4 hold
  // log2(m) and bits 3..0 hold log2(n). Bits above the low 12 are ignored.
  // Intentionally left implicit so that a compact value can be passed
  // wherever a size_triple_t is expected.
  size_triple_t(uint16_t compact)
    : k(static_cast<size_t>(1) << ((compact >> 8) & 0xf)),
      m(static_cast<size_t>(1) << ((compact >> 4) & 0xf)),
      n(static_cast<size_t>(1) << (compact & 0xf))
  {}
};
// Returns the base-2 logarithm of x, rounded down, by counting how many
// times x can be shifted right before it reaches zero. For a power of two
// this is the exact exponent; for x == 0 the result is 0.
uint8_t log2_pot(size_t x)
{
  uint8_t exponent = 0;
  for (size_t remaining = x >> 1; remaining != 0; remaining >>= 1) {
    ++exponent;
  }
  return exponent;
}
// Convert between size triples and a compact form fitting in 12 bits,
// where each size, which must be a POT, is encoded as its log2 on 4 bits:
// log2(k) in bits 11..8, log2(m) in bits 7..4 and log2(n) in bits 3..0.
// The largest representable size is therefore 2^15 == 32k ... big enough.
uint16_t compact_size_triple(size_t k, size_t m, size_t n)
{
  const int k_field = log2_pot(k) << 8;
  const int m_field = log2_pot(m) << 4;
  const int n_field = log2_pot(n);
  return k_field | m_field | n_field;
}
// 64 M is large enough that nobody has a cache bigger than that,
// while still being small enough that everybody has this much RAM,
// so conveniently we don't need to special-case platforms here.
const size_t unlikely_large_cache_size = 64 << 20;
MatrixType *lhs = new MatrixType[matrix_pool_size];
MatrixType *rhs = new MatrixType[matrix_pool_size];
MatrixType *dst = new MatrixType[matrix_pool_size];
for (size_t i = 0; i < matrix_pool_size; i++) {
lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
}
// main benchmark loop
int iters_at_a_time = 1; float time_per_iter = 0.0f;
size_t matrix_index = 0; while (true) {
double starttime = timer.getCpuTime(); for (int i = 0; i < iters_at_a_time; i++) {
dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
matrix_index++; if (matrix_index == matrix_pool_size) {
matrix_index = 0;
}
} double endtime = timer.getCpuTime();
// Prints the command line help to stderr - the program name taken from
// argv[0], the invocation name of every entry in available_actions, and the
// description of the supported options - then terminates the process with
// exit status 1. This function never returns.
void show_usage_and_exit(int/*argc*/, char* argv[], const vector<unique_ptr<action_t>>& available_actions)
{
  // Help text for the options section, one output line per entry.
  static const char* const option_help_lines[] = {
    " --min-working-set-size=N:",
    " Set the minimum working set size to N bytes.",
    " This is rounded up as needed to a multiple of matrix size.",
    " A larger working set lowers the chance of a warm cache.",
    " The default value 0 means use a large enough working",
    " set to likely outsize caches.",
    " A value of 1 (that is, 1 byte) would mean don't do anything to",
    " avoid warm caches."
  };

  cerr << "usage: " << argv[0] << " [options...]" << endl << endl;

  cerr << "available actions:" << endl << endl;
  for (const auto& action : available_actions) {
    cerr << " " << action->invokation_name() << endl;
  }
  cerr << endl;

  cerr << "options:" << endl << endl;
  for (const char* help_line : option_help_lines) {
    cerr << help_line << endl;
  }

  exit(1);
}
// multiply by an arbitrary constant to discourage trying doing anything with the // returned values besides just comparing them with each other. float result = stable_estimate * 123.456f;
// We check clock speed every minute and at the end. if (benchmark_index == benchmarks.size() ||
time_now > time_last_clock_speed_measurement + 60.0f)
{
time_last_clock_speed_measurement = time_now;
// Ensure that clock speed is as expected float current_clock_speed = measure_clock_speed();
// The tolerance needs to be smaller than the relative difference between // clock speeds that a device could operate under. // It seems unlikely that a device would be throttling clock speeds by // amounts smaller than 2%. // With a value of 1%, I was getting within noise on a Sandy Bridge. constfloat clock_speed_tolerance = 0.02f;
if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) { // Clock speed is now higher than we previously measured. // Either our initial measurement was inaccurate, which won't happen // too many times as we are keeping the best clock speed value and // and allowing some tolerance; or something really weird happened, // which invalidates all benchmark results collected so far. // Either way, we better restart all over again now. if (benchmark_index) {
cerr << "Restarting at " << 100.0f * ratio_done
<< " % because clock speed increased. " << endl;
}
max_clock_speed = current_clock_speed;
first_benchmark_to_run = 0; return;
}
bool rerun_last_tests = false;
if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
cerr << "Measurements completed so far: "
<< 100.0f * ratio_done
<< " % " << endl;
cerr << "Clock speed seems to be only "
<< current_clock_speed/max_clock_speed
<< " times what it used to be." << endl;
while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { if (seconds_to_sleep_if_lower_clock_speed > 32) {
cerr << "Sleeping longer probably won't make a difference." << endl;
cerr << "Serializing benchmarks to " << session_filename << endl;
serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
cerr << "Now restart this benchmark, and it should pick up where we left." << endl; exit(2);
}
rerun_last_tests = true;
cerr << "Sleeping "
<< seconds_to_sleep_if_lower_clock_speed
<< " s... \r" << endl;
sleep(seconds_to_sleep_if_lower_clock_speed);
current_clock_speed = measure_clock_speed();
seconds_to_sleep_if_lower_clock_speed *= 2;
}
}
if (rerun_last_tests) {
cerr << "Redoing the last "
<< 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
<< " % because clock speed had been low. " << endl; return;
}
// nothing wrong with the clock speed so far, so there won't be a need to rerun // benchmarks run so far in case we later encounter a lower clock speed.
first_benchmark_to_run = benchmark_index;
}
if (use_deserialized_benchmarks) {
benchmarks = deserialized_benchmarks;
} else { // not using deserialized benchmarks, starting from scratch
first_benchmark_to_run = 0;
// Randomly shuffling benchmarks allows us to get accurate enough progress info, // as now the cheap/expensive benchmarks are randomly mixed so they average out. // It also means that if data is corrupted for some time span, the odds are that // not all repetitions of a given benchmark will be corrupted.
random_shuffle(benchmarks.begin(), benchmarks.end());
}
for (int i = 0; i < 4; i++) {
max_clock_speed = max(max_clock_speed, measure_clock_speed());
}
// Sort timings by increasing benchmark parameters, and decreasing gflops. // The latter is very important. It means that we can ignore all but the first // benchmark with given parameters.
sort(benchmarks.begin(), benchmarks.end());
// Collect best (i.e. now first) results for each parameter values.
vector<benchmark_t> best_benchmarks; for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) { if (best_benchmarks.empty() ||
best_benchmarks.back().compact_product_size != it->compact_product_size ||
best_benchmarks.back().compact_block_size != it->compact_block_size)
{
best_benchmarks.push_back(*it);
}
}
// keep and return only the best benchmarks
benchmarks = best_benchmarks;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.