// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <stdio.h>
#include <vector>
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE
"hwy/contrib/sort/bench_sort.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/vqsort.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/sorting_networks-inl.h" // SharedTraits
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/tests/test_util-inl.h"
#include "hwy/timer-inl.h"
#include "hwy/timer.h"
#include "hwy/per_target.h"
// clang-format on
#if HWY_OS_LINUX
#include <unistd.h>
// usleep
#endif
// Mode for larger sorts because M1 is able to access more than the per-core
// share of L2, so 1M elements might still be in cache.
#define SORT_100M
0
#ifndef SORT_ONLY_COLD
#define SORT_ONLY_COLD
0
#endif
#ifndef SORT_BENCH_BASE_AND_PARTITION
#define SORT_BENCH_BASE_AND_PARTITION (!SORT_ONLY_COLD &&
0)
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
// Defined within HWY_ONCE, used by BenchAllSort.
extern int64_t first_sort_target;
extern int64_t first_cold_target;
// for BenchAllColdSort
namespace HWY_NAMESPACE {
namespace {
using detail::OrderAscending;
using detail::OrderDescending;
using detail::SharedTraits;
using detail::TraitsLane;
#if VQSORT_ENABLED
using detail::OrderAscending128;
using detail::OrderAscendingKV128;
using detail::Traits128;
#endif // VQSORT_ENABLED
HWY_NOINLINE
void BenchAllColdSort() {
// Only run the best(first) enabled target
if (first_cold_target ==
0) first_cold_target = HWY_TARGET;
if (HWY_TARGET != first_cold_target) {
return;
}
char cpu100[
100];
if (!platform::HaveTimerStop(cpu100)) {
fprintf(stderr,
"CPU '%s' does not support RDTSCP, skipping benchmark.\n",
cpu100);
return;
}
// Initialize random seeds
#if VQSORT_ENABLED
HWY_ASSERT(GetGeneratorState() != nullptr);
// vqsort
#endif
RandomState rng(
static_cast<uint64_t>(Unpredictable1() *
129));
// this test
using T = uint64_t;
constexpr size_t kSize =
10 *
1000;
HWY_ALIGN T items[kSize];
// Initialize array
#if 0 // optional: deliberate AVX-512 to verify VQSort performance improves
const ScalableTag<T> d;
const RebindToSigned<decltype(d)> di;
const size_t N = Lanes(d);
size_t i =
0;
for (; i + N <= kSize; i += N) {
// Super-slow scatter so that we spend enough time to warm up SKX.
const Vec<decltype(d)> val = Set(d,
static_cast<T>(Unpredictable1()));
const Vec<decltype(di)> idx =
Iota(di,
static_cast<T>(Unpredictable1() -
1));
ScatterIndex(val, d, items + i, idx);
}
for (; i < kSize; ++i) {
items[i] =
static_cast<T>(Unpredictable1());
}
#else // scalar-only, verified with clang-16
for (size_t i =
0; i < kSize; ++i) {
items[i] =
static_cast<T>(Unpredictable1());
}
#endif
items[Random32(&rng) % kSize] =
static_cast<T>(Unpredictable1() +
1);
const timer::Ticks t0 = timer::Start();
#if VQSORT_ENABLED &&
1 // change to && 0 to switch to std::sort.
VQSort(items, kSize, SortAscending());
#else
SharedState shared;
Run<SortAscending>(Algo::kStd, items, kSize, shared,
/*thread=*/0);
#endif
const timer::Ticks t1 = timer::Stop();
const double ticks =
static_cast<
double>(t1 - t0);
const double elapsed = ticks / platform::InvariantTicksPerSecond();
const double GBps = kSize *
sizeof(T) *
1E-
9 / elapsed;
fprintf(stderr,
"N=%zu GB/s=%.2f ns=%.1f random output: %g\n", kSize, GBps,
elapsed *
1E9,
static_cast<
double>(items[Random32(&rng) % kSize]));
#if SORT_ONLY_COLD
#if HWY_OS_LINUX
// Long enough for the CPU to switch off AVX-512 mode before the next run.
usleep(
100 *
1000);
// NOLINT
#endif
#endif
}
#if (VQSORT_ENABLED && SORT_BENCH_BASE_AND_PARTITION) || HWY_IDE
template <
class Traits>
HWY_NOINLINE
void BenchPartition() {
using LaneType =
typename Traits::LaneType;
using KeyType =
typename Traits::KeyType;
const SortTag<LaneType> d;
detail::SharedTraits<Traits> st;
const Dist dist = Dist::kUniform8;
double sum =
0.
0;
constexpr size_t kLPK = st.LanesPerKey();
HWY_ALIGN LaneType
buf[SortConstants::BufBytes<LaneType, kLPK>(HWY_MAX_BYTES) /
sizeof(LaneType)];
uint64_t* HWY_RESTRICT state = GetGeneratorState();
const size_t max_log2 = AdjustedLog2Reps(
20);
for (size_t log2 = max_log2; log2 < max_log2 +
1; ++log2) {
const size_t num_lanes =
1ull << log2;
const size_t num_keys = num_lanes / kLPK;
auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
std::vector<
double> seconds;
const size_t num_reps = (
1ull << (
14 - log2 /
2)) *
30;
for (size_t rep =
0; rep < num_reps; ++rep) {
(
void)GenerateInput(dist, aligned.get(), num_lanes);
// The pivot value can influence performance. Do exactly what vqsort will
// do so that the performance (influenced by prefetching and branch
// prediction) is likely to predict the actual performance inside vqsort.
detail::DrawSamples(d, st, aligned.get(), num_lanes, buf, state);
detail::SortSamples(d, st, buf);
auto pivot = detail::ChoosePivotByRank(d, st, buf);
const Timestamp t0;
detail::Partition(d, st, aligned.get(), num_lanes -
1, pivot, buf);
seconds.push_back(SecondsSince(t0));
// 'Use' the result to prevent optimizing out the partition.
sum +=
static_cast<
double>(aligned.get()[num_lanes /
2]);
}
Result(Algo::kVQSort, dist, num_keys,
1, SummarizeMeasurements(seconds),
sizeof(KeyType), st.KeyString())
.Print();
}
HWY_ASSERT(sum !=
999999);
// Prevent optimizing out
}
HWY_NOINLINE
void BenchAllPartition() {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3) {
return;
}
BenchPartition<TraitsLane<OrderDescending<
float>>>();
BenchPartition<TraitsLane<OrderDescending<int32_t>>>();
BenchPartition<TraitsLane<OrderDescending<int64_t>>>();
BenchPartition<Traits128<OrderAscending128>>();
// BenchPartition<Traits128<OrderDescending128>>();
BenchPartition<Traits128<OrderAscendingKV128>>();
}
template <
class Traits>
HWY_NOINLINE
void BenchBase(std::vector<Result>& results) {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
return;
}
using LaneType =
typename Traits::LaneType;
using KeyType =
typename Traits::KeyType;
const SortTag<LaneType> d;
detail::SharedTraits<Traits> st;
const Dist dist = Dist::kUniform32;
const size_t N = Lanes(d);
constexpr size_t kLPK = st.LanesPerKey();
const size_t num_lanes = SortConstants::BaseCaseNumLanes<kLPK>(N);
const size_t num_keys = num_lanes / kLPK;
auto keys = hwy::AllocateAligned<LaneType>(num_lanes);
auto buf = hwy::AllocateAligned<LaneType>(num_lanes + N);
std::vector<
double> seconds;
double sum =
0;
// prevents elision
constexpr size_t kMul = AdjustedReps(
600);
// ensures long enough to measure
for (size_t rep =
0; rep <
30; ++rep) {
InputStats<LaneType> input_stats =
GenerateInput(dist, keys.get(), num_lanes);
const Timestamp t0;
for (size_t i =
0; i < kMul; ++i) {
detail::BaseCase(d, st, keys.get(), num_lanes, buf.get());
sum +=
static_cast<
double>(keys[
0]);
}
seconds.push_back(SecondsSince(t0));
// printf("%f\n", seconds.back());
HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num_lanes,
"BenchBase"));
}
HWY_ASSERT(sum <
1E99);
results.emplace_back(Algo::kVQSort, dist, num_keys * kMul,
1,
SummarizeMeasurements(seconds),
sizeof(KeyType),
st.KeyString());
}
HWY_NOINLINE
void BenchAllBase() {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3) {
return;
}
std::vector<Result> results;
BenchBase<TraitsLane<OrderAscending<
float>>>(results);
BenchBase<TraitsLane<OrderDescending<int64_t>>>(results);
BenchBase<Traits128<OrderAscending128>>(results);
for (
const Result& r : results) {
r.Print();
}
}
#endif // VQSORT_ENABLED && SORT_BENCH_BASE_AND_PARTITION
std::vector<Algo> AlgoForBench() {
return {
#if HAVE_AVX2SORT
Algo::kSEA,
#endif
#if HAVE_PARALLEL_IPS4O
Algo::kParallelIPS4O,
#elif HAVE_IPS4O
Algo::kIPS4O,
#endif
#if HAVE_PDQSORT
Algo::kPDQ,
#endif
#if HAVE_SORT512
Algo::kSort512,
#endif
// Only include if we're compiling for the target it supports.
#if HAVE_VXSORT && ((VXSORT_AVX3 && HWY_TARGET == HWY_AVX3) || \
(!VXSORT_AVX3 && HWY_TARGET == HWY_AVX2))
Algo::kVXSort,
#endif
// Only include if we're compiling for the target it supports.
#if HAVE_INTEL && HWY_TARGET <= HWY_AVX3
Algo::kIntel,
#endif
#if !HAVE_PARALLEL_IPS4O
#if !SORT_100M
// 10-20x slower, but that's OK for the default size when we are not
// testing the parallel nor 100M modes.
// Algo::kStd,
#endif
#if VQSORT_ENABLED
Algo::kVQSort,
#endif
#endif // !HAVE_PARALLEL_IPS4O
};
}
template <
class Traits>
HWY_NOINLINE
void BenchSort(size_t num_keys) {
if (first_sort_target ==
0) first_sort_target = HWY_TARGET;
SharedState shared;
detail::SharedTraits<Traits> st;
using Order =
typename Traits::Order;
using LaneType =
typename Traits::LaneType;
using KeyType =
typename Traits::KeyType;
const size_t num_lanes = num_keys * st.LanesPerKey();
auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
const size_t reps = num_keys >
1000 *
1000 ?
10 :
30;
for (Algo algo : AlgoForBench()) {
// Other algorithms don't depend on the vector instructions, so only run
// them for the first target.
#if !HAVE_VXSORT
if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) {
continue;
}
#endif
for (Dist dist : AllDist()) {
std::vector<
double> seconds;
for (size_t rep =
0; rep < reps; ++rep) {
InputStats<LaneType> input_stats =
GenerateInput(dist, aligned.get(), num_lanes);
const Timestamp t0;
Run<Order>(algo,
reinterpret_cast<KeyType*>(aligned.get()), num_keys,
shared,
/*thread=*/0);
seconds.push_back(SecondsSince(t0));
// printf("%f\n", seconds.back());
HWY_ASSERT(
VerifySort(st, input_stats, aligned.get(), num_lanes,
"BenchSort"));
}
Result(algo, dist, num_keys,
1, SummarizeMeasurements(seconds),
sizeof(KeyType), st.KeyString())
.Print();
}
// dist
}
// algo
}
enum class BenchmarkModes {
kDefault,
k1M,
k10K,
kAllSmall,
kSmallPow2,
kSmallPow2Between,
// includes padding
kPow4,
kPow10
};
std::vector<size_t> SizesToBenchmark(BenchmarkModes mode) {
std::vector<size_t> sizes;
switch (mode) {
default:
case BenchmarkModes::kDefault:
#if HAVE_PARALLEL_IPS4O || SORT_100M
sizes.push_back(
100 *
1000 * size_t{
1000});
#else
sizes.push_back(
100);
sizes.push_back(
100 *
1000);
#endif
break;
case BenchmarkModes::k1M:
sizes.push_back(
1000 *
1000);
break;
case BenchmarkModes::k10K:
sizes.push_back(
10 *
1000);
break;
case BenchmarkModes::kAllSmall:
sizes.reserve(
128);
for (size_t i =
1; i <=
128; ++i) {
sizes.push_back(i);
}
break;
case BenchmarkModes::kSmallPow2:
for (size_t size =
2; size <=
128; size *=
2) {
sizes.push_back(size);
}
break;
case BenchmarkModes::kSmallPow2Between:
for (size_t size =
2; size <=
128; size *=
2) {
sizes.push_back(
3 * size /
2);
}
break;
case BenchmarkModes::kPow4:
for (size_t size =
4; size <=
256 *
1024; size *=
4) {
sizes.push_back(size);
}
break;
case BenchmarkModes::kPow10:
for (size_t size =
10; size <=
100 *
1000; size *=
10) {
sizes.push_back(size);
}
break;
}
return sizes;
}
HWY_NOINLINE
void BenchAllSort() {
// Not interested in benchmark results for these targets. Note that SSE4 is
// numerically less than SSE2, hence it is the lower bound.
if (HWY_SSE4 <= HWY_TARGET && HWY_TARGET <= HWY_SSE2) {
return;
}
#if HAVE_INTEL
if (HWY_TARGET > HWY_AVX3)
return;
#endif
for (size_t num_keys : SizesToBenchmark(BenchmarkModes::kSmallPow2)) {
#if !HAVE_INTEL
#if HWY_HAVE_FLOAT16
if (hwy::HaveFloat16()) {
BenchSort<TraitsLane<OtherOrder<float16_t>>>(num_keys);
}
#endif
BenchSort<TraitsLane<OrderAscending<
float>>>(num_keys);
#if HWY_HAVE_FLOAT64
if (hwy::HaveFloat64()) {
// BenchSort<TraitsLane<OtherOrder<double>>>(num_keys);
}
#endif
#endif // !HAVE_INTEL
// BenchSort<TraitsLane<OrderAscending<int16_t>>>(num_keys);
BenchSort<TraitsLane<OtherOrder<int32_t>>>(num_keys);
BenchSort<TraitsLane<OrderAscending<int64_t>>>(num_keys);
// BenchSort<TraitsLane<OtherOrder<uint16_t>>>(num_keys);
// BenchSort<TraitsLane<OtherOrder<uint32_t>>>(num_keys);
// BenchSort<TraitsLane<OrderAscending<uint64_t>>>(num_keys);
#if !HAVE_VXSORT && !HAVE_INTEL && VQSORT_ENABLED
BenchSort<Traits128<OrderAscending128>>(num_keys);
BenchSort<Traits128<OrderAscendingKV128>>(num_keys);
#endif
}
}
}
// namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
}
// namespace HWY_NAMESPACE
}
// namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
int64_t first_sort_target =
0;
// none run yet
int64_t first_cold_target =
0;
// none run yet
namespace {
HWY_BEFORE_TEST(BenchSort);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllColdSort);
#if SORT_BENCH_BASE_AND_PARTITION
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllBase);
#endif
#if !SORT_ONLY_COLD
// skip (warms up vector unit for next run)
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort);
#endif
}
// namespace
}
// namespace hwy
#endif // HWY_ONCE