// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL
// operations when compiling for those targets.
// External include guard in highway.h - see comment there.
// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL
#include "hwy/base.h"
// Avoid uninitialized warnings in GCC's emmintrin.h - see
// https://github.com/google/highway/issues/710 and pull/902
HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored
"-Wuninitialized")
HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
ignored
"-Wmaybe-uninitialized")
#endif
#include <emmintrin.h>
#include <stdio.h>
#if HWY_TARGET == HWY_SSSE3
#include <tmmintrin.h>
// SSSE3
#elif HWY_TARGET <= HWY_SSE4
#include <smmintrin.h>
// SSE4
#ifndef HWY_DISABLE_PCLMUL_AES
#include <wmmintrin.h>
// CLMUL
#endif
#endif
#include "hwy/ops/shared-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

// Enable generic functions for whichever of (f16, bf16) are not supported.
#if !HWY_HAVE_FLOAT16
#define HWY_X86_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
#else
#define HWY_X86_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
#endif

// Maps a lane type T to the native 128-bit register type that holds it.
// Integer lanes (and emulated f16/bf16) share __m128i.
template <typename T>
struct Raw128 {
  using type = __m128i;
};
#if HWY_HAVE_FLOAT16
template <>
struct Raw128<float16_t> {
  using type = __m128h;
};
#endif  // HWY_HAVE_FLOAT16
template <>
struct Raw128<float> {
  using type = __m128;
};
template <>
struct Raw128<double> {
  using type = __m128d;
};

}  // namespace detail
// Vector of N lanes of type T, backed by one 128-bit register. N defaults to
// the full register's worth of lanes and may be smaller for partial vectors.
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};
// Aliases for partial vectors holding 8, 4 or 2 bytes' worth of lanes.
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;
#if HWY_TARGET <= HWY_AVX3

namespace detail {

// Template arg: sizeof(lane type)
template <size_t size>
struct RawMask128 {};
template <>
struct RawMask128<1> {
  using type = __mmask16;
};
template <>
struct RawMask128<2> {
  using type = __mmask8;
};
template <>
struct RawMask128<4> {
  using type = __mmask8;
};
template <>
struct RawMask128<8> {
  using type = __mmask8;
};

}  // namespace detail

// AVX-512 masks: one bit per lane, held in a dedicated mask register.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using Raw = typename detail::RawMask128<sizeof(T)>::type;

  static Mask128<T, N> FromBits(uint64_t mask_bits) {
    return Mask128<T, N>{static_cast<Raw>(mask_bits)};
  }

  Raw raw;
};

#else  // AVX2 or below

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  typename detail::Raw128<T>::type raw;
};

#endif  // AVX2 or below
namespace detail {

// Returns the lowest N of the _mm_movemask* bits. A full vector (N lanes
// covering all 16 bytes) needs no masking.
template <typename T, size_t N>
constexpr uint64_t OnlyActive(uint64_t mask_bits) {
  return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
}

}  // namespace detail
#if HWY_TARGET <= HWY_AVX3
namespace detail {

// Used by Expand() emulation, which is required for both AVX3 and AVX2.
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
  return OnlyActive<T, N>(mask.raw);
}

}  // namespace detail
#endif  // HWY_TARGET <= HWY_AVX3
// Recovers the tag/descriptor and lane type from a vector type V.
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;
// ------------------------------ Zero

// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<float16_t, HWY_MAX_LANES_D(D)>{_mm_setzero_ph()};
}
#endif  // HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API Vec128<float, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<float, HWY_MAX_LANES_D(D)>{_mm_setzero_ps()};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API Vec128<double, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<double, HWY_MAX_LANES_D(D)>{_mm_setzero_pd()};
}
// Emulated f16/bf16 lanes live in integer registers.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
}

// Using the existing Zero function instead of a dedicated function for
// deduction avoids having to forward-declare Vec256 here.
template <class D>
using VFromD = decltype(Zero(D()));
// ------------------------------ Tuple (VFromD)
#include "hwy/ops/tuple-inl.h"
// ------------------------------ BitCast

namespace detail {

HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
#if HWY_HAVE_FLOAT16
HWY_INLINE __m128i BitCastToInteger(__m128h v) { return _mm_castph_si128(v); }
#endif  // HWY_HAVE_FLOAT16
HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }

// Reinterprets any vector as a same-sized vector of bytes.
template <typename T, size_t N>
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
}

// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger128 {
  HWY_INLINE __m128i operator()(__m128i v) { return v; }
};
#if HWY_HAVE_FLOAT16
template <>
struct BitCastFromInteger128<float16_t> {
  HWY_INLINE __m128h operator()(__m128i v) { return _mm_castsi128_ph(v); }
};
#endif  // HWY_HAVE_FLOAT16
template <>
struct BitCastFromInteger128<float> {
  HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); }
};
template <>
struct BitCastFromInteger128<double> {
  HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); }
};

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     Vec128<uint8_t, D().MaxBytes()> v) {
  return VFromD<D>{BitCastFromInteger128<TFromD<D>>()(v.raw)};
}

}  // namespace detail

// Reinterprets the bits of a FromT vector as lanes of TFromD<D>.
template <class D, typename FromT, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> BitCast(D d,
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}
// ------------------------------ Set

// Broadcasts the scalar t to all lanes.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{_mm_set1_epi8(static_cast<char>(t))};  // NOLINT
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{_mm_set1_epi16(static_cast<short>(t))};  // NOLINT
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{_mm_set1_epi32(static_cast<int>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{_mm_set1_epi64x(static_cast<long long>(t))};  // NOLINT
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
HWY_API VFromD<D> Set(D /* tag */, float16_t t) {
  return VFromD<D>{_mm_set1_ph(t)};
}
#endif  // HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> Set(D /* tag */, float t) {
  return VFromD<D>{_mm_set1_ps(t)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> Set(D /* tag */, double t) {
  return VFromD<D>{_mm_set1_pd(t)};
}

// Generic for all vector lengths.
template <class D, HWY_X86_IF_EMULATED_D(D)>
HWY_API VFromD<D> Set(D df, TFromD<D> t) {
  const RebindToUnsigned<decltype(df)> du;
  static_assert(sizeof(TFromD<D>) == 2, "Expecting [b]f16");
  // Copy the bit pattern into a u16 and broadcast that instead.
  uint16_t bits;
  CopyBytes<2>(&t, &bits);
  return BitCast(df, Set(du, bits));
}
// ------------------------------ Undefined
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored
"-Wuninitialized")
// Returns a vector with uninitialized elements.
template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
HWY_API VFromD<D> Undefined(D
/* tag */) {
// Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
// generate an XOR instruction.
return VFromD<D>{_mm_undefined_si128()};
}
#if HWY_HAVE_FLOAT16
template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
HWY_API VFromD<D> Undefined(D
/* tag */) {
return VFromD<D>{_mm_undefined_ph()};
}
#endif // HWY_HAVE_FLOAT16
template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> Undefined(D
/* tag */) {
return VFromD<D>{_mm_undefined_ps()};
}
template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> Undefined(D
/* tag */) {
return VFromD<D>{_mm_undefined_pd()};
}
template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)>
HWY_API VFromD<D> Undefined(D
/* tag */) {
return VFromD<D>{_mm_undefined_si128()};
}
HWY_DIAGNOSTICS(pop)
// ------------------------------ GetLane

// Returns the value of lane 0.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFF);
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API T GetLane(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const uint16_t bits =
      static_cast<uint16_t>(_mm_cvtsi128_si32(BitCast(du, v).raw) & 0xFFFF);
  return BitCastScalar<T>(bits);
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(_mm_cvtsi128_si32(v.raw));
}
template <size_t N>
HWY_API float GetLane(const Vec128<float, N> v) {
  return _mm_cvtss_f32(v.raw);
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API T GetLane(const Vec128<T, N> v) {
#if HWY_ARCH_X86_32
  // _mm_cvtsi128_si64 requires a 64-bit target; go through memory instead.
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[2];
  Store(v, d, lanes);
  return lanes[0];
#else
  return static_cast<T>(_mm_cvtsi128_si64(v.raw));
#endif
}
template <size_t N>
HWY_API double GetLane(const Vec128<double, N> v) {
  return _mm_cvtsd_f64(v.raw);
}
// ------------------------------ ResizeBitCast

// BitCast between vectors of possibly different byte widths (both <= 16).
template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
          HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToInteger(v.raw)});
}
// ------------------------------ Dup128VecFromValues

// Sets each 128-bit chunk to the given lane values, lane 0 first.
template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  return VFromD<D>{_mm_setr_epi8(
      static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2),
      static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5),
      static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8),
      static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11),
      static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
      static_cast<char>(t15))};
}

template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  return VFromD<D>{
      _mm_setr_epi16(static_cast<int16_t>(t0), static_cast<int16_t>(t1),
                     static_cast<int16_t>(t2), static_cast<int16_t>(t3),
                     static_cast<int16_t>(t4), static_cast<int16_t>(t5),
                     static_cast<int16_t>(t6), static_cast<int16_t>(t7))};
}

// Generic for all vector lengths
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  // Route through i16 lanes carrying the same bit patterns.
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 Dup128VecFromValues(
                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
}

#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  return VFromD<D>{_mm_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7)};
}
#else
// Generic for all vector lengths if HWY_HAVE_FLOAT16 is not true
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  // Emulated f16: route through i16 lanes carrying the same bit patterns.
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 Dup128VecFromValues(
                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
}
#endif  // HWY_HAVE_FLOAT16

template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  return VFromD<D>{
      _mm_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
                     static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
}

template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  return VFromD<D>{_mm_setr_ps(t0, t1, t2, t3)};
}

template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  // Need to use _mm_set_epi64x as there is no _mm_setr_epi64x intrinsic
  // available
  return VFromD<D>{
      _mm_set_epi64x(static_cast<int64_t>(t1), static_cast<int64_t>(t0))};
}

template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  return VFromD<D>{_mm_setr_pd(t0, t1)};
}
// ================================================== LOGICAL

// ------------------------------ And

template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;  // for float16_t
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{
                        _mm_and_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
}
template <size_t N>
HWY_API Vec128<float, N> And(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> And(Vec128<double, N> a, Vec128<double, N> b) {
  return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)};
}
// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  const DFromV<decltype(mask)> d;  // for float16_t
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{_mm_andnot_si128(
                        BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
}
template <size_t N>
HWY_API Vec128<float, N> AndNot(Vec128<float, N> not_mask,
                                Vec128<float, N> mask) {
  return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> AndNot(Vec128<double, N> not_mask,
                                 Vec128<double, N> mask) {
  return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)};
}
// ------------------------------ Or

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;  // for float16_t
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{
                        _mm_or_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
}
template <size_t N>
HWY_API Vec128<float, N> Or(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Or(Vec128<double, N> a, Vec128<double, N> b) {
  return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)};
}
// ------------------------------ Xor

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;  // for float16_t
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{
                        _mm_xor_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
}
template <size_t N>
HWY_API Vec128<float, N> Xor(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Xor(Vec128<double, N> a, Vec128<double, N> b) {
  return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)};
}
// ------------------------------ Not

template <typename T, size_t N>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
#if HWY_TARGET <= HWY_AVX3
  // Single vpternlog: imm8 0x55 computes NOT of the (identical) operands.
  const __m128i vu = BitCast(du, v).raw;
  return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
#else
  return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)}));
#endif
}
// ------------------------------ Xor3

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
#if HWY_TARGET <= HWY_AVX3
  // Single vpternlog: imm8 0x96 is the 3-input XOR truth table.
  const DFromV<decltype(x1)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m128i ret = _mm_ternarylogic_epi64(
      BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
  return BitCast(d, VU{ret});
#else
  return Xor(x1, Xor(x2, x3));
#endif
}
// ------------------------------ Or3

template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
#if HWY_TARGET <= HWY_AVX3
  // Single vpternlog: imm8 0xFE is the 3-input OR truth table.
  const DFromV<decltype(o1)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m128i ret = _mm_ternarylogic_epi64(
      BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
  return BitCast(d, VU{ret});
#else
  return Or(o1, Or(o2, o3));
#endif
}
// ------------------------------ OrAnd

// Returns o | (a1 & a2).
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
#if HWY_TARGET <= HWY_AVX3
  // Single vpternlog: imm8 0xF8 computes o | (a1 & a2).
  const DFromV<decltype(o)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m128i ret = _mm_ternarylogic_epi64(
      BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
  return BitCast(d, VU{ret});
#else
  return Or(o, And(a1, a2));
#endif
}
// ------------------------------ IfVecThenElse

// Bitwise select: for each bit, mask ? yes : no.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
#if HWY_TARGET <= HWY_AVX3
  // Single vpternlog: imm8 0xCA computes (mask & yes) | (~mask & no).
  const DFromV<decltype(no)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(
      d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw,
                                   BitCast(du, no).raw, 0xCA)});
#else
  return IfThenElse(MaskFromVec(mask), yes, no);
#endif
}
// ------------------------------ BitwiseIfThenElse

#if HWY_TARGET <= HWY_AVX3

#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return IfVecThenElse(mask, yes, no);
}

#endif
// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}
// ------------------------------ PopulationCount

// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
#if HWY_TARGET <= HWY_AVX3_DL

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{_mm_popcnt_epi8(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{_mm_popcnt_epi16(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{_mm_popcnt_epi32(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{_mm_popcnt_epi64(v.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
}

#endif  // HWY_TARGET <= HWY_AVX3_DL
// ================================================== SIGN

// ------------------------------ Neg

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Neg(hwy::FloatTag /*tag*/, const Vec128<T, N> v) {
  // Floating-point negation = flipping the sign bit.
  return Xor(v, SignBit(DFromV<decltype(v)>()));
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Neg(hwy::SpecialTag /*tag*/, const Vec128<T, N> v) {
  return Xor(v, SignBit(DFromV<decltype(v)>()));
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Neg(hwy::SignedTag /*tag*/, const Vec128<T, N> v) {
  return Zero(DFromV<decltype(v)>()) - v;
}

}  // namespace detail

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Neg(const Vec128<T, N> v) {
  return detail::Neg(hwy::TypeTag<T>(), v);
}
// ------------------------------ Floating-point Abs

// Generic for all vector lengths: clears the sign bit of each lane.
template <class V, HWY_IF_FLOAT(TFromV<V>)>
HWY_API V Abs(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  return v & BitCast(d, Set(di, static_cast<TI>(~SignMask<TI>())));
}
// ------------------------------ CopySign

// Generic for all vector lengths.
template <class V>
HWY_API V CopySign(const V magn, const V sign) {
  static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");

  const DFromV<decltype(magn)> d;
  const auto msb = SignBit(d);

  // Truth table for msb, magn, sign | bitwise msb ? sign : mag
  //                  0    0     0  |  0
  //                  0    0     1  |  0
  //                  0    1     0  |  1
  //                  0    1     1  |  1
  //                  1    0     0  |  0
  //                  1    0     1  |  1
  //                  1    1     0  |  0
  //                  1    1     1  |  1
  return BitwiseIfThenElse(msb, sign, magn);
}
// ------------------------------ CopySignToAbs

// Generic for all vector lengths. Precondition: abs has a clear sign bit.
template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
  const DFromV<decltype(abs)> d;
  return OrAnd(abs, SignBit(d), sign);
}
// ================================================== MASK

#if HWY_TARGET <= HWY_AVX3

// ------------------------------ MaskFromVec

namespace detail {

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
                                     const Vec128<T, N> v) {
  return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
                                     const Vec128<T, N> v) {
  return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
                                     const Vec128<T, N> v) {
  return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
                                     const Vec128<T, N> v) {
  return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
}

// There do not seem to be native floating-point versions of these instructions.
#if HWY_HAVE_FLOAT16
template <size_t N>
HWY_API Mask128<float16_t, N> MaskFromVec(const Vec128<float16_t, N> v) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  return Mask128<float16_t, N>{MaskFromVec(BitCast(di, v)).raw};
}
#endif
template <size_t N>
HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw};
}
template <size_t N>
HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw};
}

template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));
// ------------------------------ MaskFalse (MFromD)

#ifdef HWY_NATIVE_MASK_FALSE
#undef HWY_NATIVE_MASK_FALSE
#else
#define HWY_NATIVE_MASK_FALSE
#endif

// Generic for all vector lengths
template <class D>
HWY_API MFromD<D> MaskFalse(D /*d*/) {
  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(0)};
}
// ------------------------------ PromoteMaskTo (MFromD)

#ifdef HWY_NATIVE_PROMOTE_MASK_TO
#undef HWY_NATIVE_PROMOTE_MASK_TO
#else
#define HWY_NATIVE_PROMOTE_MASK_TO
#endif

// AVX3 PromoteMaskTo is generic for all vector lengths
template <class DTo, class DFrom,
          HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)),
          class DFrom_2 = Rebind<TFromD<DFrom>, DTo>,
          hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr>
HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
                                  MFromD<DFrom> m) {
  // Mask registers store one bit per lane, so widening is just a raw copy.
  return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)};
}
// ------------------------------ DemoteMaskTo (MFromD)

#ifdef HWY_NATIVE_DEMOTE_MASK_TO
#undef HWY_NATIVE_DEMOTE_MASK_TO
#else
#define HWY_NATIVE_DEMOTE_MASK_TO
#endif

// AVX3 DemoteMaskTo is generic for all vector lengths
template <class DTo, class DFrom,
          HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1),
          class DFrom_2 = Rebind<TFromD<DFrom>, DTo>,
          hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr>
HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
                                 MFromD<DFrom> m) {
  // Mask registers store one bit per lane, so narrowing is just a raw copy.
  return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)};
}
// ------------------------------ CombineMasks (MFromD)

#ifdef HWY_NATIVE_COMBINE_MASKS
#undef HWY_NATIVE_COMBINE_MASKS
#else
#define HWY_NATIVE_COMBINE_MASKS
#endif

// Concatenates the bits of lo (lower half) and hi (upper half).
template <class D, HWY_IF_LANES_D(D, 2)>
HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
                               MFromD<Half<D>> lo) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  const __mmask8 combined_mask = _kor_mask8(
      _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 1),
      _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(1)));
#else
  const auto combined_mask = (static_cast<unsigned>(hi.raw) << 1) | (lo.raw & 1);
#endif

  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
}

template <class D, HWY_IF_LANES_D(D, 4)>
HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
                               MFromD<Half<D>> lo) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  const __mmask8 combined_mask = _kor_mask8(
      _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 2),
      _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(3)));
#else
  const auto combined_mask = (static_cast<unsigned>(hi.raw) << 2) | (lo.raw & 3);
#endif

  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
}

template <class D, HWY_IF_LANES_D(D, 8)>
HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
                               MFromD<Half<D>> lo) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  const __mmask8 combined_mask = _kor_mask8(
      _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 4),
      _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(15)));
#else
  const auto combined_mask =
      (static_cast<unsigned>(hi.raw) << 4) | (lo.raw & 15u);
#endif

  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
}

template <class D, HWY_IF_LANES_D(D, 16)>
HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
                               MFromD<Half<D>> lo) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  const __mmask16 combined_mask = _mm512_kunpackb(
      static_cast<__mmask16>(hi.raw), static_cast<__mmask16>(lo.raw));
#else
  const auto combined_mask =
      ((static_cast<unsigned>(hi.raw) << 8) | (lo.raw & 0xFFu));
#endif

  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
}
// ------------------------------ LowerHalfOfMask (MFromD)

#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
#undef HWY_NATIVE_LOWER_HALF_OF_MASK
#else
#define HWY_NATIVE_LOWER_HALF_OF_MASK
#endif

// Generic for all vector lengths
template <class D>
HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) {
  using RawM = decltype(MFromD<D>().raw);
  constexpr size_t kN = MaxLanes(d);
  constexpr size_t kNumOfBitsInRawMask = sizeof(RawM) * 8;

  MFromD<D> result_mask{static_cast<RawM>(m.raw)};
  // Clear bits above lane kN unless the raw mask is already exactly kN wide.
  if (kN < kNumOfBitsInRawMask) {
    result_mask =
        And(result_mask, MFromD<D>{static_cast<RawM>((1ULL << kN) - 1)});
  }

  return result_mask;
}
// ------------------------------ UpperHalfOfMask (MFromD)

#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
#undef HWY_NATIVE_UPPER_HALF_OF_MASK
#else
#define HWY_NATIVE_UPPER_HALF_OF_MASK
#endif

// Shifts the upper half's bits down into the result mask's low bits.
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 1);
#else
  const auto shifted_mask = static_cast<unsigned>(m.raw) >> 1;
#endif

  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
}

template <class D, HWY_IF_LANES_D(D, 2)>
HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 2);
#else
  const auto shifted_mask = static_cast<unsigned>(m.raw) >> 2;
#endif

  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
}

template <class D, HWY_IF_LANES_D(D, 4)>
HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 4);
#else
  const auto shifted_mask = static_cast<unsigned>(m.raw) >> 4;
#endif

  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
}

template <class D, HWY_IF_LANES_D(D, 8)>
HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  const auto shifted_mask = _kshiftri_mask16(static_cast<__mmask16>(m.raw), 8);
#else
  const auto shifted_mask = static_cast<unsigned>(m.raw) >> 8;
#endif

  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
}
// ------------------------------ OrderedDemote2MasksTo (MFromD, CombineMasks)
#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#else
#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#endif
// Generic for all vector lengths
template <class DTo, class DFrom,
          HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2),
          class DTo_2 = Repartition<TFromD<DTo>, DFrom>,
          hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr>
HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/,
                                          MFromD<DFrom> a, MFromD<DFrom> b) {
  // AVX-512 masks are plain bit fields, so demoting two masks amounts to
  // concatenating their bits: b supplies the upper half, a the lower half.
  using MH = MFromD<Half<DTo>>;
  using RawMH = decltype(MH().raw);
  return CombineMasks(d_to, MH{static_cast<RawMH>(b.raw)},
                      MH{static_cast<RawMH>(a.raw)});
}
// ------------------------------ VecFromMask
// Expand a mask register into a vector (all-ones per selected lane), one
// overload per integer lane size, each using the matching vpmovm2* intrinsic.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi8(v.raw)};
}

template <typename T, size_t N, HWY_IF_UI16(T)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi16(v.raw)};
}

template <typename T, size_t N, HWY_IF_UI32(T)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi32(v.raw)};
}

template <typename T, size_t N, HWY_IF_UI64(T)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi64(v.raw)};
}
#if HWY_HAVE_FLOAT16
template <size_t N>
HWY_API Vec128<float16_t, N> VecFromMask(const Mask128<float16_t, N> v) {
  // Expand via the 16-bit integer path, then reinterpret as f16.
  return Vec128<float16_t, N>{_mm_castsi128_ph(_mm_movm_epi16(v.raw))};
}
#endif // HWY_HAVE_FLOAT16
template <size_t N>
HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
  // Expand via the 32-bit integer path, then reinterpret as f32.
  return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
}

template <size_t N>
HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
  // Expand via the 64-bit integer path, then reinterpret as f64.
  return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
}
// Generic for all vector lengths.
template <class D>
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
  // The tag only selects the overload; forward to the tag-less version.
  return VecFromMask(v);
}
// ------------------------------ RebindMask (MaskFromVec)
template <typename TFrom, size_t NFrom, class DTo, HWY_IF_V_SIZE_LE_D(DTo, 16)>
HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
  // Mask bits are one per lane, so reinterpretation is only valid between
  // types of equal lane size.
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  return MFromD<DTo>{m.raw};
}
// ------------------------------ IfThenElse
namespace detail {
// Lane-size dispatch helpers: select yes where mask is set, else no, via the
// matching mask_blend intrinsic (which places `no` in unselected lanes).
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */,
                                   Mask128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_blend_epi8(mask.raw, no.raw, yes.raw)};
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<2> /* tag */,
                                   Mask128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_blend_epi16(mask.raw, no.raw, yes.raw)};
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<4> /* tag */,
                                   Mask128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_blend_epi32(mask.raw, no.raw, yes.raw)};
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */,
                                   Mask128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_blend_epi64(mask.raw, no.raw, yes.raw)};
}
}
// namespace detail
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  // Dispatch on lane size to the matching detail overload.
  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
}
#if HWY_HAVE_FLOAT16
template <size_t N>
HWY_API Vec128<float16_t, N> IfThenElse(Mask128<float16_t, N> mask,
                                        Vec128<float16_t, N> yes,
                                        Vec128<float16_t, N> no) {
  return Vec128<float16_t, N>{_mm_mask_blend_ph(mask.raw, no.raw, yes.raw)};
}
#endif // HWY_HAVE_FLOAT16
// Generic for all vector lengths.
template <class V, class D = DFromV<V>, HWY_X86_IF_EMULATED_D(D)>
HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) {
  // Emulated f16/bf16 lanes: run the blend in the unsigned-integer domain.
  const RebindToUnsigned<D> du;
  return BitCast(
      D(), IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
}
template <size_t N>
HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
                                    Vec128<float, N> yes, Vec128<float, N> no) {
  return Vec128<float, N>{_mm_mask_blend_ps(mask.raw, no.raw, yes.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
                                     Vec128<double, N> yes,
                                     Vec128<double, N> no) {
  return Vec128<double, N>{_mm_mask_blend_pd(mask.raw, no.raw, yes.raw)};
}
namespace detail {
// Lane-size dispatch helpers: yes where mask is set, zero elsewhere, via the
// zero-masked move intrinsics.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<1> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> yes) {
  return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)};
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<2> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> yes) {
  return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)};
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<4> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> yes) {
  return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)};
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> yes) {
  return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)};
}
}
// namespace detail
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  // Dispatch on lane size to the matching detail overload.
  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
}

template <size_t N>
HWY_API Vec128<float, N> IfThenElseZero(Mask128<float, N> mask,
                                        Vec128<float, N> yes) {
  return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask,
                                         Vec128<double, N> yes) {
  return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)};
}

// Generic for all vector lengths.
template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)>
HWY_API V IfThenElseZero(MFromD<D> mask, V yes) {
  // Special floats: run the operation in the unsigned-integer domain.
  const RebindToUnsigned<D> du;
  return BitCast(D(), IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
}
namespace detail {
// Lane-size dispatch helpers: zero where mask is set, else no.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<1> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> no) {
  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16:
  // selected lanes compute no - no == 0, unselected lanes keep src (= no).
  return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<2> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<4> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> no) {
  // no ^ no == 0 in selected lanes; unselected lanes keep src (= no).
  return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
}
}
// namespace detail
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  // Dispatch on lane size to the matching detail overload.
  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
}

template <size_t N>
HWY_API Vec128<float, N> IfThenZeroElse(Mask128<float, N> mask,
                                        Vec128<float, N> no) {
  // no ^ no == 0 in selected lanes; unselected lanes keep src (= no).
  return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask,
                                         Vec128<double, N> no) {
  return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
}

// Generic for all vector lengths.
template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)>
HWY_API V IfThenZeroElse(MFromD<D> mask, V no) {
  // Special floats: run the operation in the unsigned-integer domain.
  const RebindToUnsigned<D> du;
  return BitCast(D(), IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
}
// ------------------------------ Mask logical

// For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
// NOTE: the `#if !defined(...)` must be a single logical line; splitting the
// directive across physical lines without a backslash continuation (as the
// previous revision did) is ill-formed preprocessor input.
#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
    HWY_COMPILER_CLANG >= 800
#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
#else
#define HWY_COMPILER_HAS_MASK_INTRINSICS 0
#endif
#endif  // HWY_COMPILER_HAS_MASK_INTRINSICS
namespace detail {
// Mask AND, one overload per lane size. 8-bit lanes need a 16-bit mask; all
// wider lanes fit in an 8-bit mask. Falls back to plain integer ops when the
// k-register intrinsics are unavailable.
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kand_mask16(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask16>(a.raw & b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
#endif
}
// Mask ANDNOT: (~a) & b, matching the kandn instruction's operand order.
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
                                const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask16>(~a.raw & b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
                                const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
                                const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
                                const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
#endif
}
// Mask OR, one overload per lane size.
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
                            const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kor_mask16(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask16>(a.raw | b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
                            const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
                            const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
                            const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
#endif
}
// Mask XOR, one overload per lane size.
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask16>(a.raw ^ b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
#endif
}
// Mask XNOR, clamped to the valid bit count of each lane size so that bits
// beyond the in-use lanes stay zero.
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
                                          const Mask128<T, N> a,
                                          const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kxnor_mask16(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
                                          const Mask128<T, N> a,
                                          const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kxnor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
                                          const Mask128<T, N> a,
                                          const Mask128<T, N> b) {
  // Only 4 lanes are valid, so clear the upper bits of the 8-bit result.
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{
      static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
                                          const Mask128<T, N> a,
                                          const Mask128<T, N> b) {
  // Only 2 lanes are valid, so clear the upper bits of the 8-bit result.
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{
      static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)};
#endif
}
// UnmaskedNot returns ~m.raw without zeroing out any invalid bits
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{static_cast<__mmask16>(_knot_mask16(m.raw))};
#else
  return Mask128<T, N>{static_cast<__mmask16>(~m.raw)};
#endif
}

template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{static_cast<__mmask8>(_knot_mask8(m.raw))};
#else
  return Mask128<T, N>{static_cast<__mmask8>(~m.raw)};
#endif
}
template <typename T>
HWY_INLINE Mask128<T> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
  // sizeof(T) == 1 and N == 16: simply return ~m as all 16 bits of m are valid
  return UnmaskedNot(m);
}

template <typename T, size_t N, HWY_IF_LANES_LE(N, 8)>
HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> m) {
  // sizeof(T) == 1 and N <= 8: fewer than 16 bits of m are valid, so compute
  // (~m) & ((1ull << N) - 1) to zero the upper bits.
  return AndNot(hwy::SizeTag<1>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
}

template <typename T>
HWY_INLINE Mask128<T> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
  // sizeof(T) == 2 and N == 8: simply return ~m as all 8 bits of m are valid
  return UnmaskedNot(m);
}

template <typename T, size_t N, HWY_IF_LANES_LE(N, 4)>
HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> m) {
  // sizeof(T) == 2 and N <= 4: fewer than 8 bits of m are valid, so compute
  // (~m) & ((1ull << N) - 1) to zero the upper bits.
  return AndNot(hwy::SizeTag<2>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> m) {
  // sizeof(T) == 4: at most 4 bits of m are valid, so compute
  // (~m) & ((1ull << N) - 1) to zero the upper bits.
  return AndNot(hwy::SizeTag<4>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> m) {
  // sizeof(T) == 8: at most 2 bits of m are valid, so compute
  // (~m) & ((1ull << N) - 1) to zero the upper bits.
  return AndNot(hwy::SizeTag<8>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
}
}
// namespace detail
// Public mask-logical entry points: dispatch on lane size to detail helpers.
template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  // Flip only the valid bits
  return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
}
#else // AVX2 or below
// ------------------------------ Mask
// Mask and Vec are the same (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return Mask128<T, N>{v.raw};
}

template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}

// Generic for all vector lengths.
template <class D>
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
  return VecFromMask(v);
}
#if HWY_TARGET >= HWY_SSSE3

// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  // No blend instruction available: emulate with a bitwise select.
  const auto vmask = VecFromMask(DFromV<decltype(no)>(), mask);
  return Or(And(vmask, yes), AndNot(vmask, no));
}

#else  // HWY_TARGET < HWY_SSSE3

// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
                                    Vec128<float, N> yes, Vec128<float, N> no) {
  return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
                                     Vec128<double, N> yes,
                                     Vec128<double, N> no) {
  return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
}

#endif  // HWY_TARGET >= HWY_SSSE3
// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}
// ------------------------------ Mask logical
// Masks are full-width vectors here, so mask logic reuses the vector ops.
template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Not(VecFromMask(d, m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
  // ~a & ~b, i.e. true only where neither input is set.
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
#endif // HWY_TARGET <= HWY_AVX3
// ------------------------------ ShiftLeft
// Compile-time shift left, one overload per 16/32/64-bit integer type.
// Left shift is identical for signed and unsigned, hence slli for both.
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
}
#if HWY_TARGET <= HWY_AVX3_DL
namespace detail {
template <typename T, size_t N>
HWY_API Vec128<T, N> GaloisAffine(
    Vec128<T, N> v, VFromD<Repartition<uint64_t, Simd<T, N, 0>>> matrix) {
  // GF(2) affine transform of each byte; the imm8 constant term is 0.
  return Vec128<T, N>{_mm_gf2p8affine_epi64_epi8(v.raw, matrix.raw, 0)};
}
}
// namespace detail
#else // HWY_TARGET > HWY_AVX3_DL
template <int kBits, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d8;
  // There is no 8-bit shift instruction: shift as 16-bit lanes, then mask off
  // the bits shifted in from the neighboring byte.
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
  // For a single-bit shift, v + v avoids the masking entirely.
  return kBits == 1
             ? (v + v)
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}
#endif // HWY_TARGET > HWY_AVX3_DL
// ------------------------------ ShiftRight
// Compile-time shift right: logical (srli) for unsigned, arithmetic (srai)
// for signed 16/32-bit lanes. There is no srai for 64-bit lanes; i64 is
// handled separately after BroadcastSignBit.
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
}
#if HWY_TARGET > HWY_AVX3_DL
template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
  const DFromV<decltype(v)> d8;
  // No 8-bit shift instruction: shift as 16-bit lanes, then mask off bits
  // shifted in from the neighboring byte.
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
}

template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  // Logical shift, then restore the sign via (x ^ s) - s, where s has the
  // shifted-down sign bit set in every lane.
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}
#endif // HWY_TARGET > HWY_AVX3_DL
// i64 is implemented after BroadcastSignBit.
// ================================================== MEMORY (1)
// Clang static analysis claims the memory immediately after a partial vector
// store is uninitialized, and also flags the input to partial loads (at least
// for loadl_pd) as "garbage". This is a false alarm because msan does not
// raise errors. We work around this by using CopyBytes instead of intrinsics,
// but only for the analyzer to avoid potentially bad code generation.
// Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7.
#ifndef HWY_SAFE_PARTIAL_LOAD_STORE
#if defined(__clang_analyzer__) || \
(HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
#define HWY_SAFE_PARTIAL_LOAD_STORE 1
#else
#define HWY_SAFE_PARTIAL_LOAD_STORE 0
#endif
#endif // HWY_SAFE_PARTIAL_LOAD_STORE
// ------------------------------ Load
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
  // Aligned 16-byte integer load.
  return VFromD<D>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t> Load(D, const float16_t* HWY_RESTRICT aligned) {
  return Vec128<float16_t>{_mm_load_ph(aligned)};
}
#endif // HWY_HAVE_FLOAT16
// Generic for all vector lengths greater than or equal to 16 bytes.
template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  // Emulated f16/bf16: load the bits as u16 lanes, then cast back.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Load(du, detail::U16LanePointer(aligned)));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API Vec128<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
  return Vec128<float>{_mm_load_ps(aligned)};
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API Vec128<double> Load(D /* tag */, const double* HWY_RESTRICT aligned) {
  return Vec128<double>{_mm_load_pd(aligned)};
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
  // Unaligned 16-byte integer load.
  return VFromD<D>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t> LoadU(D, const float16_t* HWY_RESTRICT p) {
  return Vec128<float16_t>{_mm_loadu_ph(p)};
}
#endif // HWY_HAVE_FLOAT16
// Generic for all vector lengths greater than or equal to 16 bytes.
template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  // Emulated f16/bf16: load the bits as u16 lanes, then cast back.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
  return Vec128<float>{_mm_loadu_ps(p)};
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API Vec128<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) {
  return Vec128<double>{_mm_loadu_pd(p)};
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;  // for float16_t
#if HWY_SAFE_PARTIAL_LOAD_STORE
  // Avoid _mm_loadl_epi64 so the static analyzer does not flag the partially
  // initialized register (see MEMORY section comment above).
  __m128i v = _mm_setzero_si128();
  CopyBytes<8>(p, &v);  // not same size
#else
  const __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#endif
  return BitCast(d, VFromD<decltype(du)>{v});
}
template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API Vec64<
float> Load(D
/* tag */, const float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128 v = _mm_setzero_ps();
CopyBytes<8>(p, &v);
// not same size
return Vec64<
float>{v};
#else
--> --------------------
--> maximum size reached
--> --------------------