// Copyright 2021 Google LLC // Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com> // SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: BSD-3-Clause // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
// Target-independent types/functions defined after target-specific ops.
// The "include guards" in this file that check HWY_TARGET_TOGGLE serve to skip // the generic implementation here if native ops are already defined.
#include"hwy/base.h"
// Define detail::Shuffle1230 etc, but only when viewing the current header; // normally this is included via highway.h, which includes ops/*.h. #if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED) #include"hwy/detect_targets.h" #include"hwy/ops/emu128-inl.h" #endif// HWY_IDE
// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE {
// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>. template <class V> using LaneType = decltype(GetLane(V()));
// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return // type of functions that do not take a vector argument, or as an argument type // if the function only has a template argument for D, or for explicit type // names instead of auto. This may be a built-in type. template <class D> using Vec = decltype(Zero(D()));
// Mask type. Useful as the return type of functions that do not take a mask // argument, or as an argument type if the function only has a template argument // for D, or for explicit type names instead of auto. template <class D> using Mask = decltype(MaskFromVec(Zero(D())));
// Returns the closest value to v within [lo, hi]. The lower bound is applied
// first (Max with lo), then the upper bound (Min with hi).
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
  const auto at_least_lo = Max(lo, v);
  return Min(at_least_lo, hi);
}
// CombineShiftRightBytes (and -Lanes) are not available for the scalar target, // and RVV has its own implementation of -Lanes. #if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
  using T = TFromD<D>;
  const RebindToUnsigned<decltype(d)> du;
  // Broadcast the sign mask as unsigned lanes, then reinterpret the bits as
  // the lane type of d.
  const auto sign_only = Set(du, SignMask<T>());
  return BitCast(d, sign_only);
}
// Returns quiet NaN.
template <class D>
HWY_API Vec<D> NaN(D d) {
  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;
  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
  // mantissa MSB (to indicate quiet) would be sufficient.
  const auto exp_and_mantissa_ones = Set(di, LimitsMax<TI>());
  return BitCast(d, exp_and_mantissa_ones);
}
// Returns positive infinity.
template <class D>
HWY_API Vec<D> Inf(D d) {
  using T = TFromD<D>;
  const RebindToUnsigned<D> du;
  using TU = TFromD<decltype(du)>;
  // MaxExponentTimes2 is (per its name) the infinity bit pattern shifted left
  // by one; shifting right by one undoes that before broadcasting.
  const TU exp_times_2 = static_cast<TU>(MaxExponentTimes2<T>());
  const auto inf_bits = Set(du, exp_times_2 >> 1);
  return BitCast(d, inf_bits);
}
// The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128 // target is in emu128-inl.h, and the implementation of // detail::ZeroExtendResizeBitCast for the HWY_SCALAR target is in scalar-inl.h #if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR namespace detail {
#if HWY_HAVE_SCALABLE template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
hwy::SizeTag<kFromVectSize> /* from_size_tag */,
hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from,
VFromD<DFrom> v) { const Repartition<uint8_t, DTo> d_to_u8; constauto resized = ResizeBitCast(d_to_u8, v); // Zero the upper bytes which were not present/valid in d_from. const size_t num_bytes = Lanes(Repartition<uint8_t, decltype(d_from)>()); return BitCast(d_to, IfThenElseZero(FirstN(d_to_u8, num_bytes), resized));
} #else// target that uses fixed-size vectors // Truncating or same-size resizing cast: same as ResizeBitCast template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
HWY_IF_LANES_LE(kToVectSize, kFromVectSize)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
hwy::SizeTag<kFromVectSize> /* from_size_tag */,
hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/,
VFromD<DFrom> v) { return ResizeBitCast(d_to, v);
}
// Resizing cast to vector that has twice the number of lanes of the source // vector template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
HWY_IF_LANES(kToVectSize, kFromVectSize * 2)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
hwy::SizeTag<kFromVectSize> /* from_size_tag */,
hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from,
VFromD<DFrom> v) { const Twice<decltype(d_from)> dt_from; return BitCast(d_to, ZeroExtendVector(dt_from, v));
}
// Resizing cast to vector that has more than twice the number of lanes of the // source vector template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
HWY_IF_LANES_GT(kToVectSize, kFromVectSize * 2)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
hwy::SizeTag<kFromVectSize> /* from_size_tag */,
hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/,
VFromD<DFrom> v) { using TFrom = TFromD<DFrom>;
constexpr size_t kNumOfFromLanes = kFromVectSize / sizeof(TFrom); const Repartition<TFrom, decltype(d_to)> d_resize_to; return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, kNumOfFromLanes),
ResizeBitCast(d_resize_to, v)));
} #endif// HWY_HAVE_SCALABLE
template <class DTo, class DFrom>
HWY_API Mask<DTo> PromoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
static_assert( sizeof(TFromD<DTo>) > sizeof(TFromD<DFrom>), "sizeof(TFromD<DTo>) must be greater than sizeof(TFromD<DFrom>)");
static_assert(
IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(), "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");
template <class DTo, class DFrom>
HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
static_assert(sizeof(TFromD<DTo>) < sizeof(TFromD<DFrom>), "sizeof(TFromD<DTo>) must be less than sizeof(TFromD<DFrom>)");
static_assert(
IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(), "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");
#if HWY_TARGET != HWY_SCALAR template <class DTo, class DFrom>
HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
Mask<DFrom> b) {
static_assert( sizeof(TFromD<DTo>) == sizeof(TFromD<DFrom>) / 2, "sizeof(TFromD<DTo>) must be equal to sizeof(TFromD<DFrom>) / 2");
static_assert(IsSame<Mask<DTo>, Mask<Repartition<TFromD<DTo>, DFrom>>>(), "Mask<DTo> must be the same type as " "Mask<Repartition<TFromD<DTo>, DFrom>>>()");
#if HWY_TARGET != HWY_SCALAR
// For vectors of at most 16 bytes (D().MaxBytes() <= 16),
// InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(d, a, b).
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
  return InterleaveLower(d, a, b);
}

// For vectors of at most 16 bytes (D().MaxBytes() <= 16),
// InterleaveWholeUpper(d, a, b) is equivalent to InterleaveUpper(d, a, b).
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
  return InterleaveUpper(d, a, b);
}
// InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on AVX2/AVX3 // is implemented in x86_256-inl.h.
// InterleaveWholeLower/InterleaveWholeUpper for 64-byte vectors on AVX3 is // implemented in x86_512-inl.h.
// InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on WASM_EMU256 // is implemented in wasm_256-inl.h. #endif// HWY_TARGET != HWY_SCALAR
#endif// HWY_NATIVE_INTERLEAVE_WHOLE
#if HWY_TARGET != HWY_SCALAR
// The InterleaveWholeLower without the optional D parameter is generic for all
// vector lengths: it derives the tag from V and forwards to the overload that
// takes a tag.
template <class V>
HWY_API V InterleaveWholeLower(V a, V b) {
  const DFromV<V> d;
  return InterleaveWholeLower(d, a, b);
}
#endif  // HWY_TARGET != HWY_SCALAR
// ------------------------------ AddSub
// Single-lane overload: AddSub(a, b) for a one-lane vector is equivalent to
// Sub(a, b).
template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V AddSub(V a, V b) {
  const auto difference = Sub(a, b);
  return difference;
}
// AddSub for F32x2, F32x4, and F64x2 vectors is implemented in x86_128-inl.h on // SSSE3/SSE4/AVX2/AVX3
// AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on // AVX2/AVX3 template <class V, HWY_IF_V_SIZE_GT_V(V, ((HWY_TARGET <= HWY_SSSE3 &&
hwy::IsFloat3264<TFromV<V>>())
? 32
: sizeof(TFromV<V>)))>
HWY_API V AddSub(V a, V b) { using D = DFromV<decltype(a)>; using T = TFromD<D>; using TNegate = If<!hwy::IsSigned<T>(), MakeSigned<T>, T>;
const D d; const Rebind<TNegate, D> d_negate;
// Negate the even lanes of b constauto negated_even_b = OddEven(b, BitCast(d, Neg(BitCast(d_negate, b))));
// Returns IfThenElse(m, Min(a, b), no): the Min result where the mask selects
// it, otherwise `no`.
template <class V, class M>
HWY_API V MaskedMinOr(V no, M m, V a, V b) {
  const auto computed = Min(a, b);
  return IfThenElse(m, computed, no);
}
// Returns IfThenElse(m, Max(a, b), no): the Max result where the mask selects
// it, otherwise `no`.
template <class V, class M>
HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
  const auto computed = Max(a, b);
  return IfThenElse(m, computed, no);
}
// Returns IfThenElse(m, Add(a, b), no): the sum where the mask selects it,
// otherwise `no`.
template <class V, class M>
HWY_API V MaskedAddOr(V no, M m, V a, V b) {
  const auto computed = Add(a, b);
  return IfThenElse(m, computed, no);
}
// Returns IfThenElse(m, Sub(a, b), no): the difference where the mask selects
// it, otherwise `no`.
template <class V, class M>
HWY_API V MaskedSubOr(V no, M m, V a, V b) {
  const auto computed = Sub(a, b);
  return IfThenElse(m, computed, no);
}
// Returns IfThenElse(m, Mul(a, b), no): the product where the mask selects it,
// otherwise `no`.
template <class V, class M>
HWY_API V MaskedMulOr(V no, M m, V a, V b) {
  const auto computed = Mul(a, b);
  return IfThenElse(m, computed, no);
}
// Returns IfThenElse(m, Div(a, b), no): the quotient where the mask selects
// it, otherwise `no`. Div is evaluated on the whole vectors before selection.
template <class V, class M>
HWY_API V MaskedDivOr(V no, M m, V a, V b) {
  const auto computed = Div(a, b);
  return IfThenElse(m, computed, no);
}
// Returns IfThenElse(m, Mod(a, b), no): the remainder where the mask selects
// it, otherwise `no`. Mod is evaluated on the whole vectors before selection.
template <class V, class M>
HWY_API V MaskedModOr(V no, M m, V a, V b) {
  const auto computed = Mod(a, b);
  return IfThenElse(m, computed, no);
}
// Returns IfThenElse(m, SaturatedAdd(a, b), no): the saturated sum where the
// mask selects it, otherwise `no`.
template <class V, class M>
HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
  const auto computed = SaturatedAdd(a, b);
  return IfThenElse(m, computed, no);
}
template <class V, class M>
HWY_API V MaskedSatSubOr(V no, M m, V a, V b) { return IfThenElse(m, SaturatedSub(a, b), no);
} #endif// HWY_NATIVE_MASKED_ARITH
// Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is toggled, // they (RVV/SVE/Armv8/Emu128) implement ReduceSum and SumOfLanes via Set. // Otherwise, they (Armv7/PPC/scalar/WASM/x86) define zero to most of the // SumOfLanes overloads. For the latter group, we here define the remaining // overloads, plus ReduceSum which uses them plus GetLane. #if (defined(HWY_NATIVE_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REDUCE_SCALAR #undef HWY_NATIVE_REDUCE_SCALAR #else #define HWY_NATIVE_REDUCE_SCALAR #endif
namespace detail {
// Allows reusing the same shuffle code for SumOfLanes/MinOfLanes/MaxOfLanes.
// Binary functor that forwards both arguments to Add.
struct AddFunc {
  template <class V>
  V operator()(V a, V b) const {
    const auto sum = Add(a, b);
    return sum;
  }
};
// Binary functor that forwards both arguments to Min.
struct MinFunc {
  template <class V>
  V operator()(V a, V b) const {
    const auto smaller = Min(a, b);
    return smaller;
  }
};
// Binary functor that forwards both arguments to Max.
struct MaxFunc {
  template <class V>
  V operator()(V a, V b) const {
    const auto larger = Max(a, b);
    return larger;
  }
};
// No-op for vectors of at most one block. template <class D, class Func, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE VFromD<D> ReduceAcrossBlocks(D, Func, VFromD<D> v) { return v;
}
// Reduces a lane with its counterpart in other block(s). Shared by AVX2 and // WASM_EMU256. AVX3 has its own overload. template <class D, class Func, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> ReduceAcrossBlocks(D /*d*/, Func f, VFromD<D> v) { return f(v, SwapAdjacentBlocks(v));
}
// These return the reduction result broadcasted across all lanes. They assume // the caller has already reduced across blocks.
// AVX3 has target-specific implementations of these. #if (defined(HWY_NATIVE_ISINF) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_ISINF #undef HWY_NATIVE_ISINF #else #define HWY_NATIVE_ISINF #endif
// Returns a mask that is true for lanes whose bits, ignoring the sign, equal
// the infinity pattern (exponent=max and mantissa=0).
template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsInf(const V v) {
  using T = TFromD<D>;
  using TU = MakeUnsigned<T>;
  const D d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> bits = BitCast(du, v);
  // 'Shift left' (via v + v) to clear the sign bit, then check for
  // exponent=max and mantissa=0 by comparing against MaxExponentTimes2.
  const auto bits_x2 = Add(bits, bits);
  const auto inf_x2 = Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()));
  return RebindMask(d, Eq(bits_x2, inf_x2));
}
// Returns whether normal/subnormal/zero. template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsFinite(const V v) { using T = TFromD<D>; const D d; const RebindToUnsigned<decltype(d)> du; const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison const VFromD<decltype(du)> vu = BitCast(du, v); // 'Shift left' to clear the sign bit. MSVC seems to generate incorrect code // for AVX2 if we instead add vu + vu. #if HWY_COMPILER_MSVC const VFromD<decltype(du)> shl = ShiftLeft<1>(vu); #else const VFromD<decltype(du)> shl = Add(vu, vu); #endif
// Then shift right so we can compare with the max exponent (cannot compare // with MaxExponentTimes2 directly because it is negative and non-negative // floats would be greater). const VFromD<decltype(di)> exp =
BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(shl)); return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}
#if HWY_IDE
// Placeholder definitions that are only compiled when HWY_IDE is set. Each
// ignores its second argument and returns the first one unchanged.
template <class V>
HWY_INLINE V ShuffleTwo1230(V a, V /* b */) {
  return a;
}

template <class V>
HWY_INLINE V ShuffleTwo2301(V a, V /* b */) {
  return a;
}

template <class V>
HWY_INLINE V ShuffleTwo3012(V a, V /* b */) {
  return a;
}
#endif  // HWY_IDE
// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void LoadTransposedBlocks3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& A, VFromD<D>& B,
VFromD<D>& C) {
constexpr size_t kN = MaxLanes(d);
A = LoadU(d, unaligned + 0 * kN);
B = LoadU(d, unaligned + 1 * kN);
C = LoadU(d, unaligned + 2 * kN);
}
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) { using V = VFromD<D>;
V A; // v0[1] v2[0] v1[0] v0[0]
V B; // v1[2] v0[2] v2[1] v1[1]
V C; // v2[3] v1[3] v0[3] v2[2]
detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
const V vxx_02_03_xx = OddEven(C, B);
v0 = detail::ShuffleTwo1230(A, vxx_02_03_xx);
// Shuffle2301 takes the upper/lower halves of the output from one input, so // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use // OddEven because it may have higher throughput than Shuffle. const V vxx_xx_10_11 = OddEven(A, B); const V v12_13_xx_xx = OddEven(B, C);
v1 = detail::ShuffleTwo2301(vxx_xx_10_11, v12_13_xx_xx);
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
VFromD<D>& v3) { const Repartition<uint64_t, decltype(d)> d64; using V64 = VFromD<decltype(d64)>; using V = VFromD<D>; // 16 lanes per block; the lowest four blocks are at the bottom of vA..vD. // Here int[i] means the four interleaved values of the i-th 4-tuple and // int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
V vA; // int[13..10] int[3..0]
V vB; // int[17..14] int[7..4]
V vC; // int[1b..18] int[b..8]
V vD; // int[1f..1c] int[f..c]
detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
// For brevity, the comments only list the lower block (upper = lower + 0x10) const V v5140 = InterleaveLower(d, vA, vB); // int[5,1,4,0] const V vd9c8 = InterleaveLower(d, vC, vD); // int[d,9,c,8] const V v7362 = InterleaveUpper(d, vA, vB); // int[7,3,6,2] const V vfbea = InterleaveUpper(d, vC, vD); // int[f,b,e,a]
const V v6420 = InterleaveLower(d, v5140, v7362); // int[6,4,2,0] const V veca8 = InterleaveLower(d, vd9c8, vfbea); // int[e,c,a,8] const V v7531 = InterleaveUpper(d, v5140, v7362); // int[7,5,3,1] const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea); // int[f,d,b,9]
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
VFromD<D>& v3) { // In the last step, we interleave by half of the block size, which is usually // 8 bytes but half that for 8-bit x8 vectors. using TW = hwy::UnsignedFromSize<d.MaxBytes() == 8 ? 4 : 8>; const Repartition<TW, decltype(d)> dw; using VW = VFromD<decltype(dw)>;
// (Comments are for 256-bit vectors.) // 8 lanes per block; the lowest four blocks are at the bottom of vA..vD.
VFromD<D> vA; // v3210[9]v3210[8] v3210[1]v3210[0]
VFromD<D> vB; // v3210[b]v3210[a] v3210[3]v3210[2]
VFromD<D> vC; // v3210[d]v3210[c] v3210[5]v3210[4]
VFromD<D> vD; // v3210[f]v3210[e] v3210[7]v3210[6]
detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
// Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be // filled from other vectors are 0 for blending. Note that these are byte // indices for 16-bit lanes. const VFromD<decltype(du8)> shuf_A1 =
Dup128VecFromValues(du8, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5); const VFromD<decltype(du8)> shuf_A2 =
Dup128VecFromValues(du8, 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80);
// The interleaved vectors will be named A, B, C; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. constauto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
// Reinterprets v (a vector of d_from) as a vector of d_to. Whether the lanes
// beyond those of d_from must be explicitly zeroed depends on the target.
template <class DTo, class DFrom>
HWY_INLINE VFromD<DTo> LoadNResizeBitCast(DTo d_to, DFrom d_from,
                                          VFromD<DFrom> v) {
#if HWY_TARGET <= HWY_SSE2
  // On SSE2/SSSE3/SSE4, the LoadU operation will zero out any lanes of v.raw
  // past the first (lowest-index) Lanes(d_from) lanes of v.raw if
  // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true, so a plain resize
  // suffices and d_from is not needed.
  (void)d_from;
  return ResizeBitCast(d_to, v);
#else
  // On other targets such as PPC/NEON, the contents of any lanes past the first
  // (lowest-index) Lanes(d_from) lanes of v.raw might be non-zero if
  // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true, so zero-extend.
  return ZeroExtendResizeBitCast(d_to, d_from, v);
#endif
}
// For SVE and non-sanitizer AVX-512; RVV has its own specialization.
// Loads via MaskedLoad with a FirstN(d, num_lanes) mask.
template <class D>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
#if HWY_MEM_OPS_MIGHT_FAULT
  // Avoid touching memory at all when no lanes are requested.
  if (num_lanes == 0) return Zero(d);
#endif

  const auto first_n = FirstN(d, num_lanes);
  return MaskedLoad(first_n, d, p);
}
template <class D>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
size_t num_lanes) { #if HWY_MEM_OPS_MIGHT_FAULT if (num_lanes <= 0) return no; #endif
// First, do a signed to signed demotion. This will convert any values // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a // negative value. constauto i2i_demote_result = DemoteTo(dn, BitCast(di, v));
// Second, convert any negative values to hwy::HighestValue<TFromD<DN>>() // using an unsigned Min operation. constauto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>());
#if HWY_TARGET != HWY_SCALAR || HWY_IDE template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V), class V2 = VFromD<Repartition<TFromV<V>, DN>>,
HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { const DFromV<decltype(a)> d; const RebindToSigned<decltype(d)> di; const RebindToUnsigned<decltype(dn)> dn_u;
// First, do a signed to signed demotion. This will convert any values // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a // negative value. constauto i2i_demote_result =
ReorderDemote2To(dn, BitCast(di, a), BitCast(di, b));
// Second, convert any negative values to hwy::HighestValue<TFromD<DN>>() // using an unsigned Min operation. constauto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>());
// There is no codegen advantage for a native version of this. It is provided
// only for convenience. Promotes the lower half of v to the lane type of d.
template <class D, class V>
HWY_API VFromD<D> PromoteLowerTo(D d, V v) {
  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
  // because it cannot be deduced from D (could be either bf16 or f16).
  const Rebind<TFromV<V>, decltype(d)> d_half;
  const auto lower = LowerHalf(d_half, v);
  return PromoteTo(d, lower);
}
// This requires UpperHalf. #if HWY_TARGET != HWY_SCALAR || HWY_IDE
// Promotes the upper half of v to the lane type of d.
template <class D, class V>
HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
  // because it cannot be deduced from D (could be either bf16 or f16).
  const Rebind<TFromV<V>, decltype(d)> d_half;
  const auto upper = UpperHalf(d_half, v);
  return PromoteTo(d, upper);
}
// Tag dispatch is used in detail::PromoteEvenTo and detail::PromoteOddTo as // there are target-specific specializations for some of the // detail::PromoteEvenTo and detail::PromoteOddTo cases on // SVE/PPC/SSE2/SSSE3/SSE4/AVX2.
// All targets except HWY_SCALAR use the implementations of // detail::PromoteEvenTo and detail::PromoteOddTo in generic_ops-inl.h for at // least some of the PromoteEvenTo and PromoteOddTo cases.
// Signed to signed PromoteEvenTo/PromoteOddTo template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
hwy::SignedTag /*to_type_tag*/,
hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
hwy::SignedTag /*from_type_tag*/, D d_to, V v) { #if HWY_IS_LITTLE_ENDIAN // On little-endian targets, need to shift each lane of the bitcasted vector // left by kToLaneSize * 4 bits to get the bits of the even source lanes into // the upper kToLaneSize * 4 bits of even_in_hi. constauto even_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v)); #else // On big-endian targets, the bits of the even source lanes are already in // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector. constauto even_in_hi = BitCast(d_to, v); #endif
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
hwy::SignedTag /*to_type_tag*/,
hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
hwy::SignedTag /*from_type_tag*/, D d_to, V v) { #if HWY_IS_LITTLE_ENDIAN // On little-endian targets, the bits of the odd source lanes are already in // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector. constauto odd_in_hi = BitCast(d_to, v); #else // On big-endian targets, need to shift each lane of the bitcasted vector left // by kToLaneSize * 4 bits to get the bits of the odd source lanes into the // upper kToLaneSize * 4 bits of odd_in_hi. constauto odd_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v)); #endif
// Unsigned to unsigned PromoteEvenTo/PromoteOddTo
// Promotes the even source lanes of v to the (twice as wide) lanes of d_to
// using only a bitcast plus either a mask or a shift, depending on endianness.
// The tag arguments are only used for overload selection.
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::UnsignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, the bits of the even source lanes are already
  // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.

  // Simply need to zero out the upper bits of each lane of the bitcasted
  // vector.
  return And(BitCast(d_to, v),
             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
#else
  // On big-endian targets, need to shift each lane of the bitcasted vector
  // right by kToLaneSize * 4 bits to get the bits of the even source lanes into
  // the lower kToLaneSize * 4 bits of the result.

  // The right shift below will zero out the upper kToLaneSize * 4 bits of the
  // result.
  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
#endif
}
// Unsigned to unsigned PromoteOddTo: promotes the odd source lanes of v to the
// (twice as wide) lanes of d_to using only a bitcast plus either a shift or a
// mask, depending on endianness. The tag arguments are only used for overload
// selection.
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::UnsignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, need to shift each lane of the bitcasted vector
  // right by kToLaneSize * 4 bits to get the bits of the odd source lanes into
  // the lower kToLaneSize * 4 bits of the result.

  // The right shift below will zero out the upper kToLaneSize * 4 bits of the
  // result.
  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
#else
  // On big-endian targets, the bits of the odd source lanes are already
  // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.

  // Simply need to zero out the upper bits of each lane of the bitcasted
  // vector.
  return And(BitCast(d_to, v),
             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
#endif
}
// Unsigned to signed: Same as unsigned->unsigned PromoteEvenTo/PromoteOddTo
// followed by BitCast to signed
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
  // Promote with the unsigned tag first, then reinterpret as signed.
  const auto promoted_unsigned =
      PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
                    hwy::UnsignedTag(), du_to, v);
  return BitCast(d_to, promoted_unsigned);
}
// Unsigned to signed PromoteOddTo: same as the unsigned->unsigned PromoteOddTo
// followed by BitCast to signed.
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
  // Promote with the unsigned tag first, then reinterpret as signed.
  const auto promoted_unsigned =
      PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
                   hwy::UnsignedTag(), du_to, v);
  return BitCast(d_to, promoted_unsigned);
}
// BF16->F32 PromoteEvenTo
// NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag // instead of hwy::FloatTag on targets that use scalable vectors.
// VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same // type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
// The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered // to be a bfloat16_t vector. template <class FromTypeTag, class DF32, class VBF16, class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
HWY_INLINE VFromD<DF32> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
hwy::SizeTag<4> /*to_lane_size_tag*/,
FromTypeTag /*from_type_tag*/, DF32 d_to,
VBF16 v) { const RebindToUnsigned<decltype(d_to)> du_to; #if HWY_IS_LITTLE_ENDIAN // On little-endian platforms, need to shift left each lane of the bitcasted // vector by 16 bits. return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v))); #else // On big-endian platforms, the even lanes of the source vector are already // in the upper 16 bits of the lanes of the bitcasted vector.
// Need to simply zero out the lower 16 bits of each lane of the bitcasted // vector. return BitCast(d_to, And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u}))); #endif
}
// BF16->F32 PromoteOddTo
// NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag // instead of hwy::FloatTag on targets that use scalable vectors.
// VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same // type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
// The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered // to be a bfloat16_t vector. template <class FromTypeTag, class DF32, class VBF16, class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
HWY_INLINE VFromD<DF32> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
hwy::SizeTag<4> /*to_lane_size_tag*/,
FromTypeTag /*from_type_tag*/, DF32 d_to,
VBF16 v) { const RebindToUnsigned<decltype(d_to)> du_to; #if HWY_IS_LITTLE_ENDIAN // On little-endian platforms, the odd lanes of the source vector are already // in the upper 16 bits of the lanes of the bitcasted vector.
// Need to simply zero out the lower 16 bits of each lane of the bitcasted // vector. return BitCast(d_to, And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u}))); #else // On big-endian platforms, need to shift left each lane of the bitcasted // vector by 16 bits. return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v))); #endif
}
// Default PromoteEvenTo/PromoteOddTo implementations template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D, class V, HWY_IF_LANES_D(D, 1)>
HWY_INLINE VFromD<D> PromoteEvenTo(
ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
FromTypeTag /*from_type_tag*/, D d_to, V v) { return PromoteLowerTo(d_to, v);
}
// Default PromoteEvenTo for destinations with more than one lane: gathers the
// even lanes of v with ConcatEven(d, v, v), then promotes the lower half.
template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V, HWY_IF_LANES_GT_D(D, 1)>
HWY_INLINE VFromD<D> PromoteEvenTo(
    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  const DFromV<decltype(v)> d_from;
  const auto even_lanes = ConcatEven(d_from, v, v);
  return PromoteLowerTo(d_to, even_lanes);
}
// Default PromoteOddTo: gathers the odd lanes of v with ConcatOdd(d, v, v),
// then promotes the lower half.
template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  const DFromV<decltype(v)> d_from;
  const auto odd_lanes = ConcatOdd(d_from, v, v);
  return PromoteLowerTo(d_to, odd_lanes);
}
} // namespace detail
// Public entry point: requires the lanes of D to be twice as wide as those of
// V, then dispatches to detail::PromoteEvenTo using tags built from the
// destination lane type, its size, and the source lane type.
template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
          class V2 = VFromD<Repartition<TFromV<V>, D>>,
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
HWY_API VFromD<D> PromoteEvenTo(D d, V v) {
  using TTo = TFromD<D>;
  using TFrom = TFromV<V>;
  return detail::PromoteEvenTo(hwy::TypeTag<TTo>(),
                               hwy::SizeTag<sizeof(TTo)>(),
                               hwy::TypeTag<TFrom>(), d, v);
}
// There are 23 fractional bits (plus the implied 1 bit) in the mantissa of // a F32, and there are 10 fractional bits (plus the implied 1 bit) in the // mantissa of a F16
// We want the unbiased exponent of round_incr[i] to be at least (-14) + 13 as // 2^(-14) is the smallest positive normal F16 value and as we want 13 // mantissa bits (including the implicit 1 bit) to the left of the // F32 mantissa bits in rounded_val[i] since 23 - 10 is equal to 13
// The biased exponent of round_incr[i] needs to be at least 126 as // (-14) + 13 + 127 is equal to 126
// We also want the biased exponent of round_incr[i] to be less than or equal
// to 255 (which is equal to MaxExponentField<float>())
// The biased F32 exponent of round_incr is equal to
// HWY_MAX(HWY_MIN(((exp_bits[i] >> 23) & 255) + 13, 255), 126)
// hi9_bits[i] is equal to the upper 9 bits of v[i] constauto hi9_bits = ShiftRight<23>(BitCast(du32, v));
// round_incr_hi9_bits[i] is equivalent to // (hi9_bits[i] & 0x100) | // HWY_MAX(HWY_MIN((hi9_bits[i] & 0xFF) + 13, 255), 126)
#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128 constauto k255 = Set(du32, uint32_t{255u}); constauto round_incr_hi9_bits = BitwiseIfThenElse(
k255, Max(Min(Add(And(hi9_bits, k255), k13), k255), k126), hi9_bits); #else // On targets other than SCALAR and EMU128, the exponent bits of hi9_bits can // be incremented by 13 and clamped to the [13, 255] range without overflowing // into the sign bit of hi9_bits by using U8 SaturatedAdd as there are 8 // exponent bits in an F32
// U8 Max can be used on targets other than SCALAR and EMU128 to clamp // ((hi9_bits & 0xFF) + 13) to the [126, 255] range without affecting the sign // bit
// Add round_incr[i] to v[i] to round the mantissa to the nearest F16 mantissa // and to move the fractional bits of the resulting non-NaN mantissa down to // the lower 10 bits of rounded_val if (v[i] + round_incr[i]) is a non-NaN // value constauto rounded_val = Add(v, round_incr);
// rounded_val_bits is the bits of rounded_val as a U32 constauto rounded_val_bits = BitCast(du32, rounded_val);
// rounded_val[i] is known to have the same biased exponent as round_incr[i] // as |round_incr[i]| > 2^12*|v[i]| is true if round_incr[i] is a finite // value, round_incr[i] and v[i] both have the same sign, and |round_incr[i]| // is either a power of 2 that is greater than or equal to 2^-1 or infinity.
// If rounded_val[i] is a finite F32 value, then // (rounded_val_bits[i] & 0x00000FFF) is the bit representation of the // rounded mantissa of rounded_val[i] as a UQ2.10 fixed point number that is // in the range [0, 2].
// In other words, (rounded_val_bits[i] & 0x00000FFF) is between 0 and 0x0800,
// with (rounded_val_bits[i] & 0x000003FF) being the fractional bits of the
// resulting F16 mantissa, if rounded_val[i] is a finite F32 value.
// (rounded_val_bits[i] & 0x007FF000) == 0 is guaranteed to be true if // rounded_val[i] is a non-NaN value
// The biased exponent of rounded_val[i] is guaranteed to be at least 126 as // the biased exponent of round_incr[i] is at least 126 and as both v[i] and // round_incr[i] have the same sign bit
// The ULP of a F32 value with a biased exponent of 126 is equal to // 2^(126 - 127 - 23), which is equal to 2^(-24) (which is also the ULP of a // F16 value with a biased exponent of 0 or 1 as (1 - 15 - 10) is equal to // -24)
// The biased exponent (before subtracting by 126) needs to be clamped to the // [126, 157] range as 126 + 31 is equal to 157 and as 31 is the largest // biased exponent of a F16.
// The biased exponent of the resulting F16 value is equal to // HWY_MIN((round_incr_hi9_bits[i] & 0xFF) + // ((rounded_val_bits[i] >> 10) & 0xFF), 157) - 126
// f16_bits_as_i32 is the F16 bits sign-extended to an I32 (with the upper 17 // bits of f16_bits_as_i32[i] set to the sign bit of rounded_val[i]) to allow // efficient truncation of the F16 bits to an I16 using an I32->I16 DemoteTo // operation constauto f16_bits_as_i32 =
OrAnd(f16_exp_mant_bits, ShiftRight<16>(BitCast(di32, rounded_val_bits)),
Set(di32, static_cast<int32_t>(0xFFFF8000u))); return BitCast(df16, DemoteTo(di16, f16_bits_as_i32));
}
// The mantissa bits of v[i] are first rounded using round-to-odd rounding to // the nearest F64 value that has the lower 29 bits zeroed out to ensure that // the result is correctly rounded to a F16.
// On SSE2/SSSE3/SSE4/AVX2, do an int32_t to float conversion, followed // by a unsigned right shift of the uint32_t bit representation of the // floating point values by 23, followed by an int16_t Min // operation as we are only interested in the biased exponent that would // result from a uint32_t to float conversion.
// An int32_t to float vector conversion is also much more efficient on // SSE2/SSSE3/SSE4/AVX2 than an uint32_t vector to float vector conversion // as an uint32_t vector to float vector conversion on SSE2/SSSE3/SSE4/AVX2 // requires multiple instructions whereas an int32_t to float vector // conversion can be carried out using a single instruction on // SSE2/SSSE3/SSE4/AVX2.
// Returns `v` with every bit cleared that is also set in `v >> 24`, i.e.
// AndNot(ShiftRight<24>(v), v). Only enabled for unsigned vectors with 4- or
// 8-byte lanes.
template <class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) {
  // If v[i] >= 16777216 is true, make sure that the bit at
  // HighestSetBitIndex(v[i]) - 24 is zeroed out to ensure that any inexact
  // conversion to single-precision floating point is rounded down.

  // This zeroing-out can be accomplished through the AndNot operation below.
  return AndNot(ShiftRight<24>(v), v);
}
} // namespace detail
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V HighestSetBitIndex(V v) { const DFromV<decltype(v)> d; const RebindToUnsigned<decltype(d)> du; using TU = TFromD<decltype(du)>;
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V LeadingZeroCount(V v) { const DFromV<decltype(v)> d; const RebindToUnsigned<decltype(d)> du; using TU = TFromD<decltype(du)>;
// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes. #if HWY_TARGET != HWY_SCALAR || HWY_IDE
// Define for white-box testing, even if native instructions are available. namespace detail {
// Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with // Vector Permute Instructions" and the accompanying assembly language // implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan: // https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html . // // A brute-force 256 byte table lookup can also be made constant-time, and // possibly competitive on NEON, but this is more performance-portable // especially for x86 and large vectors.
template <class V> // u8
HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL,
V affine_tblU) { const DFromV<V> du; constauto mask = Set(du, uint8_t{0xF});
// Apply the inverse affine transformation constauto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)), Or(ShiftLeft<3>(state), ShiftRight<5>(state)), Or(ShiftLeft<6>(state), ShiftRight<2>(state))),
Set(du, uint8_t{0x05}));
// The GF(2^8) multiplicative inverse is computed as follows: // - Changing the polynomial basis to GF(2^4) // - Computing the GF(2^4) multiplicative inverse // - Converting the GF(2^4) multiplicative inverse to the GF(2^8) // multiplicative inverse through table lookups using the // kGF2P4InvToGF2P8InvL and kGF2P4InvToGF2P8InvU tables return SubBytesMulInverseAndAffineLookup(b, gF2P4InvToGF2P8InvL,
gF2P4InvToGF2P8InvU);
}
// Applies one AES encryption round to `state`: SubBytes, ShiftRows,
// MixColumns, then XOR with `round_key` (AddRoundKey). Returns the new state.
template <class V>  // u8
HWY_API V AESRound(V state, const V round_key) {
  // Intel docs swap the first two steps, but it does not matter because
  // ShiftRows is a permutation and SubBytes is independent of lane index.
  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = detail::MixColumns(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}
// Applies the final AES encryption round to `state`: SubBytes, ShiftRows,
// then XOR with `round_key` (AddRoundKey). Returns the new state.
template <class V>  // u8
HWY_API V AESLastRound(V state, const V round_key) {
  // Like AESRound, but without MixColumns.
  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}
// Applies one inverse AES round to `state`: InvSubBytes, InvShiftRows,
// InvMixColumns, then XOR with `round_key` (AddRoundKey). Returns the new
// state.
template <class V>  // u8
HWY_API V AESRoundInv(V state, const V round_key) {
  state = detail::InvSubBytes(state);
  state = detail::InvShiftRows(state);
  state = detail::InvMixColumns(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}
// Applies the final inverse AES round to `state`: InvSubBytes, InvShiftRows,
// then XOR with `round_key` (AddRoundKey). Returns the new state.
template <class V>  // u8
HWY_API V AESLastRoundInv(V state, const V round_key) {
  // Like AESRoundInv, but without InvMixColumns.
  state = detail::InvSubBytes(state);
  state = detail::InvShiftRows(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}
// This overload requires vectors to be at least 16 bytes, which is the case // for LMUL >= 2. #undef HWY_IF_POPCNT #if HWY_TARGET == HWY_RVV #define HWY_IF_POPCNT(D) \
hwy::EnableIf<D().Pow2() >= 1 && D().MaxLanes() >= 16>* = nullptr #else // Other targets only have these two overloads which are mutually exclusive, so // no further conditions are required. #define HWY_IF_POPCNT(D) void* = nullptr #endif// HWY_TARGET == HWY_RVV
// Per-lane population count for u8 vectors larger than 8 bytes. Each byte is
// split into its low and high nibble; both are looked up in a 16-entry table
// whose entry i is the number of set bits in i, and the two counts are added.
template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
          HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)>
HWY_API V PopulationCount(V v) {
  const D d;
  const V lookup =
      Dup128VecFromValues(d, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
  const auto lo = And(v, Set(d, uint8_t{0xF}));
  const auto hi = ShiftRight<4>(v);
  return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
}
// RVV has a specialization that avoids the Set().
#if HWY_TARGET != HWY_RVV
// Slower fallback for capped vectors.
// Per-lane population count for u8 vectors of at most 8 bytes, computed with
// shifts, masks and adds only (no table lookup).
template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
          HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API V PopulationCount(V v) {
  const D d;
  // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
  const V k33 = Set(d, uint8_t{0x33});
  // Each 2-bit field now holds the count of its two bits.
  v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
  // Each 4-bit field now holds the count of its four bits.
  v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
  // Sum both nibble counts into the low nibble and discard the high nibble.
  return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
}
#endif  // HWY_TARGET != HWY_RVV
// Per-lane population count for u16 vectors. Counts bits per byte via the u8
// overload, then adds the high byte's count (shifted down by 8) to the low
// byte's count (masked with 0xFF) within each 16-bit lane.
template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)>
HWY_API V PopulationCount(V v) {
  const D d;
  const Repartition<uint8_t, decltype(d)> d8;
  const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
  return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
}
// Per-lane population count for u32 vectors. Reuses the u16 overload on the
// same bits, then folds the two 16-bit counts of every 32-bit lane together:
// the upper count is shifted down by 16 and added to the lower count, which is
// isolated with a 0xFF mask.
template <class V, class D = DFromV<V>, HWY_IF_U32_D(D)>
HWY_API V PopulationCount(V v) {
  const D d;
  const Repartition<uint16_t, decltype(d)> d_half;
  const auto counts_per_u16 = BitCast(d, PopulationCount(BitCast(d_half, v)));
  const auto upper_count = ShiftRight<16>(counts_per_u16);
  const auto lower_count = And(counts_per_u16, Set(d, uint32_t{0xFF}));
  return Add(upper_count, lower_count);
}
#if HWY_HAVE_INTEGER64
// Per-lane population count for u64 vectors. Counts bits per 32-bit half via
// the u32 overload, then adds the upper half's count (shifted down by 32) to
// the lower half's count (masked with 0xFF) within each 64-bit lane.
template <class V, class D = DFromV<V>, HWY_IF_U64_D(D)>
HWY_API V PopulationCount(V v) {
  const D d;
  Repartition<uint32_t, decltype(d)> d32;
  auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
  return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
}
#endif
// 8 bit and fits in wider reg: promote template <class V, HWY_IF_T_SIZE_V(V, 1),
HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
HWY_API V operator*(const V a, const V b) { const DFromV<decltype(a)> d; const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw; const RebindToUnsigned<decltype(d)> du; // TruncateTo result const RebindToUnsigned<decltype(dw)> dwu; // TruncateTo input const VFromD<decltype(dw)> mul = PromoteTo(dw, a) * PromoteTo(dw, b); // TruncateTo is cheaper than ConcatEven. return BitCast(d, TruncateTo(du, BitCast(dwu, mul)));
}
// MulAddSub(mul, x, sub_or_add) for a 1-lane vector is equivalent to // MulSub(mul, x, sub_or_add) template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V MulAddSub(V mul, V x, V sub_or_add) { return MulSub(mul, x, sub_or_add);
}
// MulAddSub for F16/F32/F64 vectors with 2 or more lanes on // SSSE3/SSE4/AVX2/AVX3 is implemented in x86_128-inl.h, x86_256-inl.h, and // x86_512-inl.h template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1),
HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | ((HWY_TARGET <= HWY_SSSE3 &&
hwy::IsFloat<TFromV<V>>())
? 0
: ((1 << 2) | (1 << 4) |
(1 << 8))))>
HWY_API V MulAddSub(V mul, V x, V sub_or_add) { using D = DFromV<V>; using T = TFromD<D>; using TNegate = If<!IsSigned<T>(), MakeSigned<T>, T>;
// If !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 is true, then UI64->F32 // IntDivConvIntToFloat(df, vi) returns an approximation of // static_cast<float>(v[i]) that is within 4 ULP of static_cast<float>(v[i]) template <class D, class V, HWY_IF_F32_D(D), HWY_IF_I64(TFromV<V>)>
HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vi) { const Twice<decltype(df32)> dt_f32;
auto vf32 =
ConvertTo(dt_f32, BitCast(RebindToSigned<decltype(dt_f32)>(), vi));
template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
HWY_IF_T_SIZE_GT(TFromV<V>, kOrigLaneSize)>
HWY_INLINE V IntDivUsingFloatDiv(V a, V b) { const DFromV<decltype(a)> d; const RebindToFloat<decltype(d)> df;
// If kOrigLaneSize < sizeof(T) is true, then a[i] and b[i] are both in the // [LimitsMin<SignedFromSize<kOrigLaneSize>>(), // LimitsMax<UnsignedFromSize<kOrigLaneSize>>()] range.
// floor(|a[i] / b[i]|) <= |flt_q| < floor(|a[i] / b[i]|) + 1 is also // guaranteed to be true if MakeFloat<T> has at least kOrigLaneSize*8 + 1 // mantissa bits (including the implied one bit), where flt_q is equal to // static_cast<MakeFloat<T>>(a[i]) / static_cast<MakeFloat<T>>(b[i]), // even in the case where the magnitude of an inexact floating point division // result is rounded up.
// In other words, floor(flt_q) < flt_q < ceil(flt_q) is guaranteed to be true // if (a[i] % b[i]) != 0 is true and MakeFloat<T> has at least // kOrigLaneSize*8 + 1 mantissa bits (including the implied one bit), even in // the case where the magnitude of an inexact floating point division result // is rounded up.
#if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
!HWY_HAVE_FLOAT64 // On Armv7, do division by multiplying by the ApproximateReciprocal // to avoid unnecessary overhead as F32 Div refines the approximate // reciprocal using 4 Newton-Raphson iterations
const RebindToSigned<decltype(d)> di; const RebindToUnsigned<decltype(d)> du;
constauto flt_b = ConvertTo(df, b); auto flt_recip_b = ApproximateReciprocal(flt_b); if (kOrigLaneSize > 1) {
flt_recip_b =
Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
}
// Need to negate r1[i] if a[i] < 0 is true if (IsSigned<TFromV<V>>()) {
r1 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r1);
}
// r1[i] is now equal to (a[i] < 0) ? (-r0[i]) : r0[i]
auto abs_b = BitCast(du, b); if (IsSigned<TFromV<V>>()) {
abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
}
// If (r1[i] < 0 || r1[i] >= abs_b[i]) is true, then set q1[i] to -1. // Otherwise, set q1[i] to 0.
// (r1[i] < 0 || r1[i] >= abs_b[i]) can be carried out using a single unsigned // comparison as static_cast<TU>(r1[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i] // will be true if r1[i] < 0 is true. auto q1 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r1), abs_b)));
// q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ? -1 : 0
// Need to negate q1[i] if r0[i] and b[i] do not have the same sign auto q1_negate_mask = r0; if (IsSigned<TFromV<V>>()) {
q1_negate_mask = Xor(q1_negate_mask, BitCast(di, b));
}
q1 = IfNegativeThenElse(q1_negate_mask, Neg(q1), q1);
// q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ?
// (((r0[i] ^ b[i]) < 0) ? 1 : -1) : 0
// Need to subtract q1[i] from q0[i] to get the final result return Sub(q0, BitCast(d, q1)); #else // On targets other than Armv7 NEON, use F16 or F32 division as most targets // other than Armv7 NEON have native F32 divide instructions return ConvertTo(d, Div(ConvertTo(df, a), ConvertTo(df, b))); #endif
}
template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
HWY_IF_T_SIZE(TFromV<V>, kOrigLaneSize),
HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
HWY_INLINE V IntDivUsingFloatDiv(V a, V b) { // If kOrigLaneSize == sizeof(T) is true, at least two reciprocal // multiplication steps are needed as the mantissa of MakeFloat<T> has fewer // than kOrigLaneSize*8 + 1 bits
using T = TFromV<V>;
#if HWY_HAVE_FLOAT64 using TF = MakeFloat<T>; #else using TF = float; #endif
// Need to negate r4[i] if a[i] < 0 is true if (IsSigned<TFromV<V>>()) {
r4 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r4);
}
// r4[i] is now equal to (a[i] < 0) ? (-r3[i]) : r3[i]
auto abs_b = BitCast(du, b); if (IsSigned<TFromV<V>>()) {
abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
}
// If (r4[i] < 0 || r4[i] >= abs_b[i]) is true, then set q4[i] to -1.
// Otherwise, set q4[i] to 0.
// (r4[i] < 0 || r4[i] >= abs_b[i]) can be carried out using a single unsigned // comparison as static_cast<TU>(r4[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i] // will be true if r4[i] < 0 is true. auto q4 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r4), abs_b)));
// q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ? -1 : 0
// Need to negate q4[i] if r3[i] and b[i] do not have the same sign auto q4_negate_mask = r3; if (IsSigned<TFromV<V>>()) {
q4_negate_mask = Xor(q4_negate_mask, BitCast(di, b));
}
q4 = IfNegativeThenElse(q4_negate_mask, Neg(q4), q4);
// q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ?
// (((r3[i] ^ b[i]) < 0) ? 1 : -1) : 0
// The final result is equal to q0[i] + q1[i] - q4[i] return Sub(Add(q0, BitCast(d, q1)), BitCast(d, q4));
}
template <size_t kOrigLaneSize, class V,
HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
HWY_IF_V_SIZE_LE_V(
V, HWY_MAX_BYTES /
((!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1) ? 4 : 2))>
HWY_INLINE V IntDiv(V a, V b) { using T = TFromV<V>;
// If HWY_HAVE_FLOAT16 is 0, need to promote I8 to I32 and U8 to U32 using TW = MakeWide< If<(!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1), MakeWide<T>, T>>;
#if HWY_TARGET <= HWY_SSE2 // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid // unnecessary overhead const RebindToSigned<decltype(dw)> dw_i;
// On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<T> if // kOrigLaneSize < sizeof(T) to avoid unnecessary overhead constIf<(kOrigLaneSize < sizeof(T)), RebindToSigned<decltype(d)>,
decltype(d)>
d_demote_to; #else // On other targets, promote to TW and demote to T const decltype(dw) dw_i; const decltype(d) d_demote_to; #endif
template <size_t kOrigLaneSize, class V,
HWY_IF_T_SIZE_ONE_OF_V(V,
(HWY_HAVE_FLOAT16 ? (1 << 1) : 0) | (1 << 2)),
HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) { const DFromV<decltype(a)> d; const RepartitionToWide<decltype(d)> dw;
#if HWY_TARGET <= HWY_SSE2 // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid // unnecessary overhead const RebindToSigned<decltype(dw)> dw_i;
// On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<TFromV<V>> if // kOrigLaneSize < sizeof(TFromV<V>) to avoid unnecessary overhead constIf<(kOrigLaneSize < sizeof(TFromV<V>)), RebindToSigned<decltype(d)>,
decltype(d)>
d_demote_to; #else // On other targets, promote to MakeWide<TFromV<V>> and demote to TFromV<V> const decltype(dw) dw_i; const decltype(d) d_demote_to; #endif
#if !HWY_HAVE_FLOAT16 template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
HWY_IF_V_SIZE_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) { const DFromV<decltype(a)> d; const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
#if HWY_TARGET <= HWY_SSE2 // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary // overhead const RebindToSigned<decltype(dw)> dw_i; #else // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V> const decltype(dw) dw_i; #endif
return DemoteTo(d,
BitCast(dw_i, IntDiv<1>(PromoteTo(dw, a), PromoteTo(dw, b))));
} template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) { const DFromV<decltype(a)> d; const RepartitionToWide<decltype(d)> dw;
#if HWY_TARGET <= HWY_SSE2 // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary // overhead const RebindToSigned<decltype(dw)> dw_i; #else // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V> const decltype(dw) dw_i; #endif
// NOTE: SatWidenMulPairwiseAdd(di16, a_u, b_i) cannot be used in // SumOfMulQuadAccumulate as it is possible for // a_u[0]*b_i[0]+a_u[1]*b_i[1] to overflow an int16_t if a_u[0], b_i[0], // a_u[1], and b_i[1] are all non-zero and b_i[0] and b_i[1] have the same // sign.
// WidenMulPairwiseAdd(di32, a, b) is okay here as // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if // a[0], b[0], a[1], and b[1] are all equal to -32768.
// The upper 32 bits of sum0 and sum1 need to be zeroed out in the case of // overflow. constauto hi32_mask = Set(di64, static_cast<int64_t>(~int64_t{0xFFFFFFFF})); constauto p0_zero_out_mask =
ShiftLeft<32>(BitCast(di64, i32_pairwise_sum_overflow)); constauto p1_zero_out_mask = And(BitCast(di64, i32_pairwise_sum_overflow), hi32_mask);
for (size_t i = 0; i < Lanes(d); i += 8) { // Each byte worth of bits is the index of one of 256 8-byte ranges, and its // population count determines how far to advance the write position. const size_t bits8 = bits[i / 8]; constauto indices = Load(d8, table + bits8 * 8); constauto compressed = TableLookupBytes(LoadU(d8, lanes + i), indices);
StoreU(compressed, d8, pos);
pos += PopCount(bits8);
} returnstatic_cast<size_t>(pos - unaligned);
}
// CompressStore for 1-byte lanes: converts `mask` to packed bits with
// StoreMaskBits and forwards to CompressBitsStore, returning its result.
template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API size_t CompressStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) {
  // Buffer holds one bit per lane, but never fewer than 8 bytes.
  // NOTE(review): presumably StoreMaskBits may write up to 8 bytes - confirm.
  uint8_t packed_mask[HWY_MAX(size_t{8}, MaxLanes(d) / 8)];
  // The number of mask bytes written is not needed here.
  const size_t mask_bytes_written = StoreMaskBits(d, mask, packed_mask);
  (void)mask_bytes_written;
  const size_t num_stored = CompressBitsStore(v, packed_mask, d, unaligned);
  return num_stored;
}
// CompressBlendedStore for 1-byte lanes: compresses into an aligned local
// buffer, then writes only the first `count` lanes to `unaligned` via
// BlendedStore, where `count` is the value CompressStore returned. Returns
// that count.
template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API size_t CompressBlendedStore(V v, M mask, D d,
                                    T* HWY_RESTRICT unaligned) {
  HWY_ALIGN T staging[MaxLanes(d)];
  const size_t count = CompressStore(v, mask, d, staging);
  const auto compressed = Load(d, staging);
  const auto first_count = FirstN(d, count);
  BlendedStore(compressed, first_count, d, unaligned);
  return count;
}
// For reasons unknown, HWY_IF_T_SIZE_V is a compile error in SVE. template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
HWY_API V Compress(V v, const M mask) { const DFromV<V> d;
HWY_ALIGN T lanes[MaxLanes(d)];
(void)CompressStore(v, mask, d, lanes); return Load(d, lanes);
}
// CompressNot for 1-byte lanes: Compress with the negated mask.
template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
HWY_API V CompressNot(V v, M mask) {
  const auto inverted_mask = Not(mask);
  return Compress(v, inverted_mask);
}
#endif// HWY_NATIVE_COMPRESS8
// ------------------------------ Expand
// Note that this generic implementation assumes <= 128 bit fixed vectors; // the SVE and RVV targets provide their own native implementations. #if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE #ifdef HWY_NATIVE_EXPAND #undef HWY_NATIVE_EXPAND #else #define HWY_NATIVE_EXPAND #endif
// We want to skip past the v bytes already consumed by idxL. There is no // instruction for shift-reg by variable bytes. Storing v itself would work // but would involve a store-load forwarding stall. We instead shuffle using // loaded indices. multishift_epi64_epi8 would also help, but if we have that, // we probably also have native 8-bit Expand.
alignas(16) static constexpr uint8_t iota[32] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}; const VFromD<decltype(du)> shift = LoadU(du, iota + PopCount(maskL)); const VFromD<decltype(duh)> vL = LowerHalf(duh, vu); const VFromD<decltype(duh)> vH =
LowerHalf(duh, TableLookupBytesOr0(vu, shift));
// For lane i, shift the i-th 4-bit index down to bits [0, 2). const Vec128<uint32_t, N> packed = Set(du, packed_array[mask_bits]);
alignas(16) static constexpr uint32_t shifts[4] = {0, 4, 8, 12};
Vec128<uint32_t, N> indices = packed >> Load(du, shifts); // AVX2 _mm256_permutexvar_epi32 will ignore upper bits, but IndicesFromVec // checks bounds, so clear the upper bits.
indices = And(indices, Set(du, N - 1)); const Vec128<uint32_t, N> expand =
TableLookupLanes(BitCast(du, v), IndicesFromVec(du, indices)); // TableLookupLanes cannot also zero masked-off lanes, so do that now. return IfThenElseZero(mask, BitCast(d, expand));
}
// Expand for full 128-bit vectors of 8-byte lanes, implemented as Compress
// followed by zeroing the lanes whose mask bit is false.
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
  // Same as Compress, just zero out the mask=false lanes.
  return IfThenElseZero(mask, Compress(v, mask));
}
// For single-element vectors, this is at least as fast as native.
// Returns `v` where `mask` is true and zero otherwise.
template <typename T>
HWY_API Vec128<T, 1> Expand(Vec128<T, 1> v, Mask128<T, 1> mask) {
  return IfThenElseZero(mask, v);
}
#undef HWY_PREFER_ROTATE // Platforms on which RotateRight is likely faster than TableLookupBytes. // RVV and SVE anyway have their own implementation of this. #if HWY_TARGET == HWY_SSE2 || HWY_TARGET <= HWY_AVX3 || \
HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_PPC8 #define HWY_PREFER_ROTATE 1 #else #define HWY_PREFER_ROTATE 0 #endif
// On these targets, we emulate 8-bit shifts using 16-bit shifts and therefore // require at least two lanes to BitCast to 16-bit. We avoid Highway's 8-bit // shifts because those would add extra masking already taken care of by // UI8ReverseBitsStep. Note that AVX3_DL/AVX3_ZEN4 support GFNI and use it to // implement ReverseBits, so this code is not used there. #undef HWY_REVERSE_BITS_MIN_BYTES #if ((HWY_TARGET >= HWY_AVX3 && HWY_TARGET <= HWY_SSE2) || \
HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256) #define HWY_REVERSE_BITS_MIN_BYTES 2 #else #define HWY_REVERSE_BITS_MIN_BYTES 1 #endif
// The detail::Per4LaneBlockShuffle overloads that have the extra lane_size_tag
// and vect_size_tag parameters are only called for vectors that have at
// least 4 lanes (or scalable vectors that might possibly have 4 or more lanes)
//
// This overload ignores all three tag arguments and forwards `v` together
// with the compile-time index pattern kIdx3210 to TblLookupPer4LaneBlkShuf.
template <size_t kIdx3210, size_t kLaneSize, size_t kVectSize, class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<kLaneSize> /*lane_size_tag*/,
                                  hwy::SizeTag<kVectSize> /*vect_size_tag*/,
                                  V v) {
  return TblLookupPer4LaneBlkShuf(v, kIdx3210);
}
#if HWY_TARGET != HWY_SCALAR template <int kAOffset, int kBOffset, class V8, HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
static_assert(0 <= kAOffset && kAOffset <= 1, "kAOffset must be between 0 and 1");
static_assert(0 <= kBOffset && kBOffset <= 3, "kBOffset must be between 0 and 3"); using D8 = DFromV<V8>; const D8 d8; const RebindToUnsigned<decltype(d8)> du8; const RepartitionToWide<decltype(d8)> d16; const RepartitionToWide<decltype(du8)> du16;
// Ensure that a is resized to a vector that has at least // HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the interleave and // CombineShiftRightBytes operations below. #if HWY_TARGET == HWY_RVV // On RVV targets, need to ensure that d8_interleave.Pow2() >= 0 is true // to ensure that Lanes(d8_interleave) >= 16 is true.
// Lanes(d8_interleave) >= Lanes(d8) is guaranteed to be true on RVV // targets as d8_interleave.Pow2() >= d8.Pow2() is true.
constexpr int kInterleavePow2 = HWY_MAX(d8.Pow2(), 0); const ScalableTag<TFromD<D8>, kInterleavePow2> d8_interleave; #elif HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
HWY_TARGET == HWY_SVE2_128 // On SVE targets, Lanes(d8_interleave) >= 16 and // Lanes(d8_interleave) >= Lanes(d8) are both already true as d8 is a SIMD // tag for a full u8/i8 vector on SVE. const D8 d8_interleave; #else // On targets that use non-scalable vector types, Lanes(d8_interleave) is // equal to HWY_MAX(Lanes(d8), size_t{8} << kAOffset).
constexpr size_t kInterleaveLanes =
HWY_MAX(HWY_MAX_LANES_D(D8), size_t{8} << kAOffset); const FixedTag<TFromD<D8>, kInterleaveLanes> d8_interleave; #endif
// The ResizeBitCast operation below will resize a to a vector that has // at least HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the // InterleaveLower, InterleaveUpper, and CombineShiftRightBytes operations // below. constauto a_to_interleave = ResizeBitCast(d8_interleave, a);
// a01 and a23 are resized back to V8 as only the first Lanes(d8) lanes of // the CombineShiftRightBytes are needed for the subsequent AbsDiff operations // and as a01 and a23 need to be the same vector type as b01 and b23 for the // AbsDiff operations below. const V8 a01 =
ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 1>(
d8_interleave, a_interleaved_hi, a_interleaved_lo)); const V8 a23 =
ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 5>(
d8_interleave, a_interleaved_hi, a_interleaved_lo));
#if HWY_TARGET != HWY_SCALAR template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V8,
HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
V8 b) {
static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
#if HWY_TARGET == HWY_RVV // On RVV, ensure that both vA and vB have a LMUL of at least 1/2 so that // both vA and vB can be bitcasted to a u32 vector. const detail::AdjustSimdTagToMinVecPow2<
RepartitionToWideX2<DFromV<decltype(a)>>>
d32; const RepartitionToNarrow<decltype(d32)> d16; const RepartitionToNarrow<decltype(d16)> d8;
// SVE* and RVV currently cannot define operators and have already defined // (only) the corresponding functions such as Add. #if (defined(HWY_NATIVE_OPERATOR_REPLACEMENTS) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS #undef HWY_NATIVE_OPERATOR_REPLACEMENTS #else #define HWY_NATIVE_OPERATOR_REPLACEMENTS #endif
// Function form of operator+ for vector types that define it.
template <class V>
HWY_API V Add(V a, V b) {
  const V sum = a + b;
  return sum;
}

// Function form of operator- for vector types that define it.
template <class V>
HWY_API V Sub(V a, V b) {
  const V difference = a - b;
  return difference;
}
// Function form of operator* for vector types that define it.
template <class V>
HWY_API V Mul(V a, V b) {
  const V product = a * b;
  return product;
}

// Function form of operator/ for vector types that define it.
template <class V>
HWY_API V Div(V a, V b) {
  const V quotient = a / b;
  return quotient;
}

// Function form of operator% for vector types that define it.
template <class V>
HWY_API V Mod(V a, V b) {
  const V remainder = a % b;
  return remainder;
}
// Function form of operator<<: shifts `a` left by the amounts in `b`.
template <class V>
V Shl(V a, V b) {
  const V shifted_left = a << b;
  return shifted_left;
}

// Function form of operator>>: shifts `a` right by the amounts in `b`.
template <class V>
V Shr(V a, V b) {
  const V shifted_right = a >> b;
  return shifted_right;
}
// Function form of operator==; the result type is whatever `a == b` yields.
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
  const auto is_equal = (a == b);
  return is_equal;
}

// Function form of operator!=; declared with the result type of `a == b`.
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
  const auto is_different = (a != b);
  return is_different;
}

// Function form of operator<; declared with the result type of `a == b`.
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
  const auto is_less = (a < b);
  return is_less;
}
// Function form of operator>; declared with the result type of `a == b`.
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
  const auto is_greater = (a > b);
  return is_greater;
}

// Function form of operator>=; declared with the result type of `a == b`.
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
  const auto is_greater_or_equal = (a >= b);
  return is_greater_or_equal;
}
// Function form of operator<=; declared with the result type of `a == b`.
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
  const auto is_less_or_equal = (a <= b);
  return is_less_or_equal;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.