Quelle generic_ops-inl.h

Sprache: C

// Copyright 2021 Google LLC
// Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
// SPDX-License-Identifier: Apache-2.0
// SPDX-License-Identifier: BSD-3-Clause
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Target-independent types/functions defined after target-specific ops.

// The "include guards" in this file that check HWY_TARGET_TOGGLE serve to skip
// the generic implementation here if native ops are already defined.

#include "hwy/base.h"

// Define detail::Shuffle1230 etc, but only when viewing the current header;
// normally this is included via highway.h, which includes ops/*.h.
#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
#include "hwy/detect_targets.h"
#include "hwy/ops/emu128-inl.h"
#endif  // HWY_IDE

// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
template <class V>
using LaneType = decltype(GetLane(V()));

// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return
// type of functions that do not take a vector argument, or as an argument type
// if the function only has a template argument for D, or for explicit type
// names instead of auto. This may be a built-in type.
template <class D>
using Vec = decltype(Zero(D()));

// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template argument
// for D, or for explicit type names instead of auto.
template <class D>
using Mask = decltype(MaskFromVec(Zero(D())));

// Returns the closest value to v within [lo, hi].
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
  return Min(Max(lo, v), hi);
}

// CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
// and RVV has its own implementation of -Lanes.
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV

template <size_t kLanes, class D>
HWY_API VFromD<D> CombineShiftRightLanes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  static_assert(kBytes < 16, "Shift count is per-block");
  return CombineShiftRightBytes<kBytes>(d, hi, lo);
}

#endif

// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, SignMask<TFromD<D>>()));
}

// Returns quiet NaN.
template <class D>
HWY_API Vec<D> NaN(D d) {
  const RebindToSigned<D> di;
  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
  // mantissa MSB (to indicate quiet) would be sufficient.
  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}

// Returns positive infinity.
template <class D>
HWY_API Vec<D> Inf(D d) {
  const RebindToUnsigned<D> du;
  using T = TFromD<D>;
  using TU = TFromD<decltype(du)>;
  const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>());
  return BitCast(d, Set(du, max_x2 >> 1));
}

// ------------------------------ ZeroExtendResizeBitCast

// The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128
// target is in emu128-inl.h, and the implementation of
// detail::ZeroExtendResizeBitCast for the HWY_SCALAR target is in scalar-inl.h
#if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR
namespace detail {

#if HWY_HAVE_SCALABLE
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from,
    VFromD<DFrom> v) {
  const Repartition<uint8_t, DTo> d_to_u8;
  const auto resized = ResizeBitCast(d_to_u8, v);
  // Zero the upper bytes which were not present/valid in d_from.
  const size_t num_bytes = Lanes(Repartition<uint8_t, decltype(d_from)>());
  return BitCast(d_to, IfThenElseZero(FirstN(d_to_u8, num_bytes), resized));
}
#else   // target that uses fixed-size vectors
// Truncating or same-size resizing cast: same as ResizeBitCast
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
          HWY_IF_LANES_LE(kToVectSize, kFromVectSize)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/,
    VFromD<DFrom> v) {
  return ResizeBitCast(d_to, v);
}

// Resizing cast to vector that has twice the number of lanes of the source
// vector
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
          HWY_IF_LANES(kToVectSize, kFromVectSize * 2)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from,
    VFromD<DFrom> v) {
  const Twice<decltype(d_from)> dt_from;
  return BitCast(d_to, ZeroExtendVector(dt_from, v));
}

// Resizing cast to vector that has more than twice the number of lanes of the
// source vector
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
          HWY_IF_LANES_GT(kToVectSize, kFromVectSize * 2)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/,
    VFromD<DFrom> v) {
  using TFrom = TFromD<DFrom>;
  constexpr size_t kNumOfFromLanes = kFromVectSize / sizeof(TFrom);
  const Repartition<TFrom, decltype(d_to)> d_resize_to;
  return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, kNumOfFromLanes),
                                      ResizeBitCast(d_resize_to, v)));
}
#endif  // HWY_HAVE_SCALABLE

}  // namespace detail
#endif  // HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR

template <class DTo, class DFrom>
HWY_API VFromD<DTo> ZeroExtendResizeBitCast(DTo d_to, DFrom d_from,
                                            VFromD<DFrom> v) {
  return detail::ZeroExtendResizeBitCast(hwy::SizeTag<d_from.MaxBytes()>(),
                                         hwy::SizeTag<d_to.MaxBytes()>(), d_to,
                                         d_from, v);
}

// ------------------------------ SafeFillN

template <class D, typename T = TFromD<D>>
HWY_API void SafeFillN(const size_t num, const T value, D d,
                       T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
  (void)d;
  for (size_t i = 0; i < num; ++i) {
    to[i] = value;
  }
#else
  BlendedStore(Set(d, value), FirstN(d, num), d, to);
#endif
}

// ------------------------------ SafeCopyN

template <class D, typename T = TFromD<D>>
HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
                       T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
  (void)d;
  for (size_t i = 0; i < num; ++i) {
    to[i] = from[i];
  }
#else
  const Mask<D> mask = FirstN(d, num);
  BlendedStore(MaskedLoad(mask, d, from), mask, d, to);
#endif
}

// ------------------------------ MaskFalse
#if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MASK_FALSE
#undef HWY_NATIVE_MASK_FALSE
#else
#define HWY_NATIVE_MASK_FALSE
#endif

template <class D>
HWY_API Mask<D> MaskFalse(D d) {
  return MaskFromVec(Zero(d));
}

#endif  // HWY_NATIVE_MASK_FALSE

// ------------------------------ BitwiseIfThenElse
#if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return Or(And(mask, yes), AndNot(mask, no));
}

#endif  // HWY_NATIVE_BITWISE_IF_THEN_ELSE

// ------------------------------ PromoteMaskTo

#if (defined(HWY_NATIVE_PROMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PROMOTE_MASK_TO
#undef HWY_NATIVE_PROMOTE_MASK_TO
#else
#define HWY_NATIVE_PROMOTE_MASK_TO
#endif

template <class DTo, class DFrom>
HWY_API Mask<DTo> PromoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
  static_assert(
      sizeof(TFromD<DTo>) > sizeof(TFromD<DFrom>),
      "sizeof(TFromD<DTo>) must be greater than sizeof(TFromD<DFrom>)");
  static_assert(
      IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
      "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");

  const RebindToSigned<decltype(d_to)> di_to;
  const RebindToSigned<decltype(d_from)> di_from;

  return MaskFromVec(BitCast(
      d_to, PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
}

#endif  // HWY_NATIVE_PROMOTE_MASK_TO

// ------------------------------ DemoteMaskTo

#if (defined(HWY_NATIVE_DEMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_DEMOTE_MASK_TO
#undef HWY_NATIVE_DEMOTE_MASK_TO
#else
#define HWY_NATIVE_DEMOTE_MASK_TO
#endif

template <class DTo, class DFrom>
HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
  static_assert(sizeof(TFromD<DTo>) < sizeof(TFromD<DFrom>),
                "sizeof(TFromD<DTo>) must be less than sizeof(TFromD<DFrom>)");
  static_assert(
      IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
      "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");

  const RebindToSigned<decltype(d_to)> di_to;
  const RebindToSigned<decltype(d_from)> di_from;

  return MaskFromVec(
      BitCast(d_to, DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
}

#endif  // HWY_NATIVE_DEMOTE_MASK_TO

// ------------------------------ CombineMasks

#if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_COMBINE_MASKS
#undef HWY_NATIVE_COMBINE_MASKS
#else
#define HWY_NATIVE_COMBINE_MASKS
#endif

#if HWY_TARGET != HWY_SCALAR
template <class D>
HWY_API Mask<D> CombineMasks(D d, Mask<Half<D>> hi, Mask<Half<D>> lo) {
  const Half<decltype(d)> dh;
  return MaskFromVec(Combine(d, VecFromMask(dh, hi), VecFromMask(dh, lo)));
}
#endif

#endif  // HWY_NATIVE_COMBINE_MASKS

// ------------------------------ LowerHalfOfMask

#if (defined(HWY_NATIVE_LOWER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
#undef HWY_NATIVE_LOWER_HALF_OF_MASK
#else
#define HWY_NATIVE_LOWER_HALF_OF_MASK
#endif

template <class D>
HWY_API Mask<D> LowerHalfOfMask(D d, Mask<Twice<D>> m) {
  const Twice<decltype(d)> dt;
  return MaskFromVec(LowerHalf(d, VecFromMask(dt, m)));
}

#endif  // HWY_NATIVE_LOWER_HALF_OF_MASK

// ------------------------------ UpperHalfOfMask

#if (defined(HWY_NATIVE_UPPER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
#undef HWY_NATIVE_UPPER_HALF_OF_MASK
#else
#define HWY_NATIVE_UPPER_HALF_OF_MASK
#endif

#if HWY_TARGET != HWY_SCALAR
template <class D>
HWY_API Mask<D> UpperHalfOfMask(D d, Mask<Twice<D>> m) {
  const Twice<decltype(d)> dt;
  return MaskFromVec(UpperHalf(d, VecFromMask(dt, m)));
}
#endif

#endif  // HWY_NATIVE_UPPER_HALF_OF_MASK

// ------------------------------ OrderedDemote2MasksTo

#if (defined(HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#else
#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#endif

#if HWY_TARGET != HWY_SCALAR
template <class DTo, class DFrom>
HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
                                        Mask<DFrom> b) {
  static_assert(
      sizeof(TFromD<DTo>) == sizeof(TFromD<DFrom>) / 2,
      "sizeof(TFromD<DTo>) must be equal to sizeof(TFromD<DFrom>) / 2");
  static_assert(IsSame<Mask<DTo>, Mask<Repartition<TFromD<DTo>, DFrom>>>(),
                "Mask<DTo> must be the same type as "
                "Mask<Repartition<TFromD<DTo>, DFrom>>>()");

  const RebindToSigned<decltype(d_from)> di_from;
  const RebindToSigned<decltype(d_to)> di_to;

  const auto va = BitCast(di_from, VecFromMask(d_from, a));
  const auto vb = BitCast(di_from, VecFromMask(d_from, b));
  return MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb)));
}
#endif

#endif  // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO

// ------------------------------ InterleaveWholeLower/InterleaveWholeUpper
#if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
#undef HWY_NATIVE_INTERLEAVE_WHOLE
#else
#define HWY_NATIVE_INTERLEAVE_WHOLE
#endif

#if HWY_TARGET != HWY_SCALAR
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
  // InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(a, b) if
  // D().MaxBytes() <= 16 is true
  return InterleaveLower(d, a, b);
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
  // InterleaveWholeUpper(d, a, b) is equivalent to InterleaveUpper(a, b) if
  // D().MaxBytes() <= 16 is true
  return InterleaveUpper(d, a, b);
}

// InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on AVX2/AVX3
// is implemented in x86_256-inl.h.

// InterleaveWholeLower/InterleaveWholeUpper for 64-byte vectors on AVX3 is
// implemented in x86_512-inl.h.

// InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on WASM_EMU256
// is implemented in wasm_256-inl.h.
#endif  // HWY_TARGET != HWY_SCALAR

#endif  // HWY_NATIVE_INTERLEAVE_WHOLE

#if HWY_TARGET != HWY_SCALAR
// The InterleaveWholeLower without the optional D parameter is generic for all
// vector lengths.
template <class V>
HWY_API V InterleaveWholeLower(V a, V b) {
  return InterleaveWholeLower(DFromV<V>(), a, b);
}
#endif  // HWY_TARGET != HWY_SCALAR

// ------------------------------ AddSub

template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V AddSub(V a, V b) {
  // AddSub(a, b) for a one-lane vector is equivalent to Sub(a, b)
  return Sub(a, b);
}

// AddSub for F32x2, F32x4, and F64x2 vectors is implemented in x86_128-inl.h on
// SSSE3/SSE4/AVX2/AVX3

// AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on
// AVX2/AVX3
template <class V, HWY_IF_V_SIZE_GT_V(V, ((HWY_TARGET <= HWY_SSSE3 &&
                                           hwy::IsFloat3264<TFromV<V>>())
                                              ? 32
                                              : sizeof(TFromV<V>)))>
HWY_API V AddSub(V a, V b) {
  using D = DFromV<decltype(a)>;
  using T = TFromD<D>;
  using TNegate = If<!hwy::IsSigned<T>(), MakeSigned<T>, T>;

  const D d;
  const Rebind<TNegate, D> d_negate;

  // Negate the even lanes of b
  const auto negated_even_b = OddEven(b, BitCast(d, Neg(BitCast(d_negate, b))));

  return Add(a, negated_even_b);
}

// ------------------------------ MaskedAddOr etc.
#if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MASKED_ARITH
#undef HWY_NATIVE_MASKED_ARITH
#else
#define HWY_NATIVE_MASKED_ARITH
#endif

template <class V, class M>
HWY_API V MaskedMinOr(V no, M m, V a, V b) {
  return IfThenElse(m, Min(a, b), no);
}

template <class V, class M>
HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
  return IfThenElse(m, Max(a, b), no);
}

template <class V, class M>
HWY_API V MaskedAddOr(V no, M m, V a, V b) {
  return IfThenElse(m, Add(a, b), no);
}

template <class V, class M>
HWY_API V MaskedSubOr(V no, M m, V a, V b) {
  return IfThenElse(m, Sub(a, b), no);
}

template <class V, class M>
HWY_API V MaskedMulOr(V no, M m, V a, V b) {
  return IfThenElse(m, Mul(a, b), no);
}

template <class V, class M>
HWY_API V MaskedDivOr(V no, M m, V a, V b) {
  return IfThenElse(m, Div(a, b), no);
}

template <class V, class M>
HWY_API V MaskedModOr(V no, M m, V a, V b) {
  return IfThenElse(m, Mod(a, b), no);
}

template <class V, class M>
HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
  return IfThenElse(m, SaturatedAdd(a, b), no);
}

template <class V, class M>
HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
  return IfThenElse(m, SaturatedSub(a, b), no);
}
#endif  // HWY_NATIVE_MASKED_ARITH

// ------------------------------ IfNegativeThenNegOrUndefIfZero

#if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#else
#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#endif

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
  // MaskedSubOr is more efficient than IfNegativeThenElse on RVV/SVE
  const auto zero = Zero(DFromV<V>());
  return MaskedSubOr(v, Lt(mask, zero), zero, v);
#else
  return IfNegativeThenElse(mask, Neg(v), v);
#endif
}

#endif  // HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
  return CopySign(v, Xor(mask, v));
}

// ------------------------------ SaturatedNeg

#if (defined(HWY_NATIVE_SATURATED_NEG_8_16_32) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
#undef HWY_NATIVE_SATURATED_NEG_8_16_32
#else
#define HWY_NATIVE_SATURATED_NEG_8_16_32
#endif

template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
          HWY_IF_SIGNED_V(V)>
HWY_API V SaturatedNeg(V v) {
  const DFromV<decltype(v)> d;
  return SaturatedSub(Zero(d), v);
}

template <class V, HWY_IF_I32(TFromV<V>)>
HWY_API V SaturatedNeg(V v) {
  const DFromV<decltype(v)> d;

#if HWY_TARGET == HWY_RVV ||                               \
    (HWY_TARGET >= HWY_PPC10 && HWY_TARGET <= HWY_PPC8) || \
    (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
  // RVV/NEON/SVE/PPC have native I32 SaturatedSub instructions
  return SaturatedSub(Zero(d), v);
#else
  // ~v[i] - ((v[i] > LimitsMin<int32_t>()) ? -1 : 0) is equivalent to
  // (v[i] > LimitsMin<int32_t>) ? (-v[i]) : LimitsMax<int32_t>() since
  // -v[i] == ~v[i] + 1 == ~v[i] - (-1) and
  // ~LimitsMin<int32_t>() == LimitsMax<int32_t>().
  return Sub(Not(v), VecFromMask(d, Gt(v, Set(d, LimitsMin<int32_t>()))));
#endif
}
#endif  // HWY_NATIVE_SATURATED_NEG_8_16_32

#if (defined(HWY_NATIVE_SATURATED_NEG_64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_NEG_64
#undef HWY_NATIVE_SATURATED_NEG_64
#else
#define HWY_NATIVE_SATURATED_NEG_64
#endif

template <class V, HWY_IF_I64(TFromV<V>)>
HWY_API V SaturatedNeg(V v) {
#if HWY_TARGET == HWY_RVV || \
    (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
  // RVV/NEON/SVE have native I64 SaturatedSub instructions
  const DFromV<decltype(v)> d;
  return SaturatedSub(Zero(d), v);
#else
  const auto neg_v = Neg(v);
  return Add(neg_v, BroadcastSignBit(And(v, neg_v)));
#endif
}
#endif  // HWY_NATIVE_SATURATED_NEG_64

// ------------------------------ SaturatedAbs

#if (defined(HWY_NATIVE_SATURATED_ABS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_ABS
#undef HWY_NATIVE_SATURATED_ABS
#else
#define HWY_NATIVE_SATURATED_ABS
#endif

template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V SaturatedAbs(V v) {
  return Max(v, SaturatedNeg(v));
}

#endif

// ------------------------------ Reductions

// Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is toggled,
// they (RVV/SVE/Armv8/Emu128) implement ReduceSum and SumOfLanes via Set.
// Otherwise, they (Armv7/PPC/scalar/WASM/x86) define zero to most of the
// SumOfLanes overloads. For the latter group, we here define the remaining
// overloads, plus ReduceSum which uses them plus GetLane.
#if (defined(HWY_NATIVE_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REDUCE_SCALAR
#undef HWY_NATIVE_REDUCE_SCALAR
#else
#define HWY_NATIVE_REDUCE_SCALAR
#endif

namespace detail {

// Allows reusing the same shuffle code for SumOfLanes/MinOfLanes/MaxOfLanes.
struct AddFunc {
  template <class V>
  V operator()(V a, V b) const {
    return Add(a, b);
  }
};

struct MinFunc {
  template <class V>
  V operator()(V a, V b) const {
    return Min(a, b);
  }
};

struct MaxFunc {
  template <class V>
  V operator()(V a, V b) const {
    return Max(a, b);
  }
};

// No-op for vectors of at most one block.
template <class D, class Func, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE VFromD<D> ReduceAcrossBlocks(D, Func, VFromD<D> v) {
  return v;
}

// Reduces a lane with its counterpart in other block(s). Shared by AVX2 and
// WASM_EMU256. AVX3 has its own overload.
template <class D, class Func, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> ReduceAcrossBlocks(D /*d*/, Func f, VFromD<D> v) {
  return f(v, SwapAdjacentBlocks(v));
}

// These return the reduction result broadcasted across all lanes. They assume
// the caller has already reduced across blocks.

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v10) {
  return f(v10, Reverse2(d, v10));
}

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v3210) {
  const VFromD<D> v0123 = Reverse4(d, v3210);
  const VFromD<D> v03_12_12_03 = f(v3210, v0123);
  const VFromD<D> v12_03_03_12 = Reverse2(d, v03_12_12_03);
  return f(v03_12_12_03, v12_03_03_12);
}

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v76543210) {
  // The upper half is reversed from the lower half; omit for brevity.
  const VFromD<D> v34_25_16_07 = f(v76543210, Reverse8(d, v76543210));
  const VFromD<D> v0347_1625_1625_0347 =
      f(v34_25_16_07, Reverse4(d, v34_25_16_07));
  return f(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
}

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_U8_D(D)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
  const RepartitionToWide<decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;
  const VW vw = BitCast(dw, v);
  // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
  const VW even = And(vw, Set(dw, 0xFF));
  const VW odd = ShiftRight<8>(vw);
  const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
#if HWY_IS_LITTLE_ENDIAN
  return DupEven(BitCast(d, reduced));
#else
  return DupOdd(BitCast(d, reduced));
#endif
}

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_I8_D(D)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
  const RepartitionToWide<decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;
  const VW vw = BitCast(dw, v);
  // Sign-extend
  // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
  const VW even = ShiftRight<8>(ShiftLeft<8>(vw));
  const VW odd = ShiftRight<8>(vw);
  const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
#if HWY_IS_LITTLE_ENDIAN
  return DupEven(BitCast(d, reduced));
#else
  return DupOdd(BitCast(d, reduced));
#endif
}

}  // namespace detail

template <class D, HWY_IF_SUM_OF_LANES_D(D)>
HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
  const detail::AddFunc f;
  v = detail::ReduceAcrossBlocks(d, f, v);
  return detail::ReduceWithinBlocks(d, f, v);
}
template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
  const detail::MinFunc f;
  v = detail::ReduceAcrossBlocks(d, f, v);
  return detail::ReduceWithinBlocks(d, f, v);
}
template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
  const detail::MaxFunc f;
  v = detail::ReduceAcrossBlocks(d, f, v);
  return detail::ReduceWithinBlocks(d, f, v);
}

template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
  return GetLane(SumOfLanes(d, v));
}
template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
  return GetLane(MinOfLanes(d, v));
}
template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
  return GetLane(MaxOfLanes(d, v));
}

#endif  // HWY_NATIVE_REDUCE_SCALAR

// Corner cases for both generic and native implementations:
// N=1 (native covers N=2 e.g. for u64x2 and even u32x2 on Arm)
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
  return GetLane(v);
}
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceMin(D /*d*/, VFromD<D> v) {
  return GetLane(v);
}
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceMax(D /*d*/, VFromD<D> v) {
  return GetLane(v);
}

template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
  return v;
}
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
  return v;
}
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
  return v;
}

// N=4 for 8-bit is still less than the minimum native size.

// ARMv7 NEON/PPC/RVV/SVE have target-specific implementations of the N=4 I8/U8
// ReduceSum operations
#if (defined(HWY_NATIVE_REDUCE_SUM_4_UI8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
#undef HWY_NATIVE_REDUCE_SUM_4_UI8
#else
#define HWY_NATIVE_REDUCE_SUM_4_UI8
#endif
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
  const Twice<RepartitionToWide<decltype(d)>> dw;
  return static_cast<TFromD<D>>(ReduceSum(dw, PromoteTo(dw, v)));
}
#endif  // HWY_NATIVE_REDUCE_SUM_4_UI8

// RVV/SVE have target-specific implementations of the N=4 I8/U8
// ReduceMin/ReduceMax operations
#if (defined(HWY_NATIVE_REDUCE_MINMAX_4_UI8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
#else
#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
#endif
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
  const Twice<RepartitionToWide<decltype(d)>> dw;
  return static_cast<TFromD<D>>(ReduceMin(dw, PromoteTo(dw, v)));
}
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
  const Twice<RepartitionToWide<decltype(d)>> dw;
  return static_cast<TFromD<D>>(ReduceMax(dw, PromoteTo(dw, v)));
}
#endif  // HWY_NATIVE_REDUCE_MINMAX_4_UI8

// ------------------------------ IsInf, IsFinite

// AVX3 has target-specific implementations of these.
#if (defined(HWY_NATIVE_ISINF) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ISINF
#undef HWY_NATIVE_ISINF
#else
#define HWY_NATIVE_ISINF
#endif

template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsInf(const V v) {
  using T = TFromD<D>;
  const D d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(
      d,
      Eq(Add(vu, vu),
         Set(du, static_cast<MakeUnsigned<T>>(hwy::MaxExponentTimes2<T>()))));
}

// Returns whether normal/subnormal/zero.
template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsFinite(const V v) {
  using T = TFromD<D>;
  const D d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
// 'Shift left' to clear the sign bit. MSVC seems to generate incorrect code
// for AVX2 if we instead add vu + vu.
#if HWY_COMPILER_MSVC
  const VFromD<decltype(du)> shl = ShiftLeft<1>(vu);
#else
  const VFromD<decltype(du)> shl = Add(vu, vu);
#endif

  // Then shift right so we can compare with the max exponent (cannot compare
  // with MaxExponentTimes2 directly because it is negative and non-negative
  // floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(shl));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}

#endif  // HWY_NATIVE_ISINF

// ------------------------------ LoadInterleaved2

#if HWY_IDE || \
    (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

template <class D, HWY_IF_LANES_GT_D(D, 1)>
HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1) {
  const VFromD<D> A = LoadU(d, unaligned);  // v1[1] v0[1] v1[0] v0[0]
  const VFromD<D> B = LoadU(d, unaligned + Lanes(d));
  v0 = ConcatEven(d, B, A);
  v1 = ConcatOdd(d, B, A);
}

template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
}

// ------------------------------ LoadInterleaved3 (CombineShiftRightBytes)

namespace detail {

#if HWY_IDE
template <class V>
HWY_INLINE V ShuffleTwo1230(V a, V /* b */) {
  return a;
}
template <class V>
HWY_INLINE V ShuffleTwo2301(V a, V /* b */) {
  return a;
}
template <class V>
HWY_INLINE V ShuffleTwo3012(V a, V /* b */) {
  return a;
}
#endif  // HWY_IDE

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void LoadTransposedBlocks3(D d,
                                      const TFromD<D>* HWY_RESTRICT unaligned,
                                      VFromD<D>& A, VFromD<D>& B,
                                      VFromD<D>& C) {
  constexpr size_t kN = MaxLanes(d);
  A = LoadU(d, unaligned + 0 * kN);
  B = LoadU(d, unaligned + 1 * kN);
  C = LoadU(d, unaligned + 2 * kN);
}

}  // namespace detail

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  using V = VFromD<D>;
  using VU = VFromD<decltype(du)>;
  // Compact notation so these fit on one line: 12 := v1[2].
  V A;  // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
  V B;  // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
  V C;  // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes.
  constexpr uint8_t Z = 0x80;
  const VU idx_v0A =
      Dup128VecFromValues(du, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU idx_v0B =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z);
  const VU idx_v0C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13);
  const VU idx_v1A =
      Dup128VecFromValues(du, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU idx_v1B =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z);
  const VU idx_v1C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14);
  const VU idx_v2A =
      Dup128VecFromValues(du, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU idx_v2B =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z);
  const VU idx_v2C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15);
  const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
  const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
  const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
  const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
  const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
  const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
  const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
  const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
  const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}

// 8-bit lanes x8
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  using V = VFromD<D>;
  using VU = VFromD<decltype(du)>;
  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes.
  constexpr uint8_t Z = 0x80;
  const VU idx_v0A =
      Dup128VecFromValues(du, 0, 3, 6, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v0B =
      Dup128VecFromValues(du, Z, Z, Z, 1, 4, 7, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v0C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v1A =
      Dup128VecFromValues(du, 1, 4, 7, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v1B =
      Dup128VecFromValues(du, Z, Z, Z, 2, 5, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v1C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v2A =
      Dup128VecFromValues(du, 2, 5, Z, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v2B =
      Dup128VecFromValues(du, Z, Z, 0, 3, 6, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v2C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0);
  const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
  const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
  const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
  const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
  const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
  const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
  const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
  const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
  const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}

// 16-bit lanes x8
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint8_t, decltype(du)> du8;
  using V = VFromD<D>;
  using VU8 = VFromD<decltype(du8)>;
  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
  // but each element of the array contains a byte index for a byte of a lane.
  constexpr uint8_t Z = 0x80;
  const VU8 idx_v0A = Dup128VecFromValues(du8, 0x00, 0x01, 0x06, 0x07, 0x0C,
                                          0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v0B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x02, 0x03,
                                          0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z);
  const VU8 idx_v0C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          Z, 0x04, 0x05, 0x0A, 0x0B);
  const VU8 idx_v1A = Dup128VecFromValues(du8, 0x02, 0x03, 0x08, 0x09, 0x0E,
                                          0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v1B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x04, 0x05,
                                          0x0A, 0x0B, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v1C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D);
  const VU8 idx_v2A = Dup128VecFromValues(du8, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z,
                                          Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v2B = Dup128VecFromValues(du8, Z, Z, Z, Z, 0x00, 0x01, 0x06,
                                          0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v2C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F);
  const V v0L = TableLookupBytesOr0(A, BitCast(d, idx_v0A));
  const V v0M = TableLookupBytesOr0(B, BitCast(d, idx_v0B));
  const V v0U = TableLookupBytesOr0(C, BitCast(d, idx_v0C));
  const V v1L = TableLookupBytesOr0(A, BitCast(d, idx_v1A));
  const V v1M = TableLookupBytesOr0(B, BitCast(d, idx_v1B));
  const V v1U = TableLookupBytesOr0(C, BitCast(d, idx_v1C));
  const V v2L = TableLookupBytesOr0(A, BitCast(d, idx_v2A));
  const V v2M = TableLookupBytesOr0(B, BitCast(d, idx_v2B));
  const V v2U = TableLookupBytesOr0(C, BitCast(d, idx_v2C));
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  using V = VFromD<D>;
  V A;  // v0[1] v2[0] v1[0] v0[0]
  V B;  // v1[2] v0[2] v2[1] v1[1]
  V C;  // v2[3] v1[3] v0[3] v2[2]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);

  const V vxx_02_03_xx = OddEven(C, B);
  v0 = detail::ShuffleTwo1230(A, vxx_02_03_xx);

  // Shuffle2301 takes the upper/lower halves of the output from one input, so
  // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use
  // OddEven because it may have higher throughput than Shuffle.
  const V vxx_xx_10_11 = OddEven(A, B);
  const V v12_13_xx_xx = OddEven(B, C);
  v1 = detail::ShuffleTwo2301(vxx_xx_10_11, v12_13_xx_xx);

  const V vxx_20_21_xx = OddEven(B, A);
  v2 = detail::ShuffleTwo3012(vxx_20_21_xx, C);
}

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  VFromD<D> A;  // v1[0] v0[0]
  VFromD<D> B;  // v0[1] v2[0]
  VFromD<D> C;  // v2[1] v1[1]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  v0 = OddEven(B, A);
  v1 = CombineShiftRightBytes<sizeof(TFromD<D>)>(d, C, A);
  v2 = OddEven(C, B);
}

template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
}

// ------------------------------ LoadInterleaved4

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void LoadTransposedBlocks4(D d,
                                      const TFromD<D>* HWY_RESTRICT unaligned,
                                      VFromD<D>& vA, VFromD<D>& vB,
                                      VFromD<D>& vC, VFromD<D>& vD) {
  constexpr size_t kN = MaxLanes(d);
  vA = LoadU(d, unaligned + 0 * kN);
  vB = LoadU(d, unaligned + 1 * kN);
  vC = LoadU(d, unaligned + 2 * kN);
  vD = LoadU(d, unaligned + 3 * kN);
}

}  // namespace detail

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  const Repartition<uint64_t, decltype(d)> d64;
  using V64 = VFromD<decltype(d64)>;
  using V = VFromD<D>;
  // 16 lanes per block; the lowest four blocks are at the bottom of vA..vD.
  // Here int[i] means the four interleaved values of the i-th 4-tuple and
  // int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
  V vA;  // int[13..10] int[3..0]
  V vB;  // int[17..14] int[7..4]
  V vC;  // int[1b..18] int[b..8]
  V vD;  // int[1f..1c] int[f..c]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);

  // For brevity, the comments only list the lower block (upper = lower + 0x10)
  const V v5140 = InterleaveLower(d, vA, vB);  // int[5,1,4,0]
  const V vd9c8 = InterleaveLower(d, vC, vD);  // int[d,9,c,8]
  const V v7362 = InterleaveUpper(d, vA, vB);  // int[7,3,6,2]
  const V vfbea = InterleaveUpper(d, vC, vD);  // int[f,b,e,a]

  const V v6420 = InterleaveLower(d, v5140, v7362);  // int[6,4,2,0]
  const V veca8 = InterleaveLower(d, vd9c8, vfbea);  // int[e,c,a,8]
  const V v7531 = InterleaveUpper(d, v5140, v7362);  // int[7,5,3,1]
  const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea);  // int[f,d,b,9]

  const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531));  // v10[7..0]
  const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9));  // v10[f..8]
  const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531));  // v32[7..0]
  const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9));  // v32[f..8]

  v0 = BitCast(d, InterleaveLower(d64, v10L, v10U));
  v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U));
  v2 = BitCast(d, InterleaveLower(d64, v32L, v32U));
  v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U));
}

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  // In the last step, we interleave by half of the block size, which is usually
  // 8 bytes but half that for 8-bit x8 vectors.
  using TW = hwy::UnsignedFromSize<d.MaxBytes() == 8 ? 4 : 8>;
  const Repartition<TW, decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;

  // (Comments are for 256-bit vectors.)
  // 8 lanes per block; the lowest four blocks are at the bottom of vA..vD.
  VFromD<D> vA;  // v3210[9]v3210[8] v3210[1]v3210[0]
  VFromD<D> vB;  // v3210[b]v3210[a] v3210[3]v3210[2]
  VFromD<D> vC;  // v3210[d]v3210[c] v3210[5]v3210[4]
  VFromD<D> vD;  // v3210[f]v3210[e] v3210[7]v3210[6]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);

  const VFromD<D> va820 = InterleaveLower(d, vA, vB);  // v3210[a,8] v3210[2,0]
  const VFromD<D> vec64 = InterleaveLower(d, vC, vD);  // v3210[e,c] v3210[6,4]
  const VFromD<D> vb931 = InterleaveUpper(d, vA, vB);  // v3210[b,9] v3210[3,1]
  const VFromD<D> vfd75 = InterleaveUpper(d, vC, vD);  // v3210[f,d] v3210[7,5]

  const VW v10_b830 =  // v10[b..8] v10[3..0]
      BitCast(dw, InterleaveLower(d, va820, vb931));
  const VW v10_fc74 =  // v10[f..c] v10[7..4]
      BitCast(dw, InterleaveLower(d, vec64, vfd75));
  const VW v32_b830 =  // v32[b..8] v32[3..0]
      BitCast(dw, InterleaveUpper(d, va820, vb931));
  const VW v32_fc74 =  // v32[f..c] v32[7..4]
      BitCast(dw, InterleaveUpper(d, vec64, vfd75));

  v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74));
  v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74));
  v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74));
  v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74));
}

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  using V = VFromD<D>;
  V vA;  // v3210[4] v3210[0]
  V vB;  // v3210[5] v3210[1]
  V vC;  // v3210[6] v3210[2]
  V vD;  // v3210[7] v3210[3]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
  const V v10e = InterleaveLower(d, vA, vC);  // v1[6,4] v0[6,4] v1[2,0] v0[2,0]
  const V v10o = InterleaveLower(d, vB, vD);  // v1[7,5] v0[7,5] v1[3,1] v0[3,1]
  const V v32e = InterleaveUpper(d, vA, vC);  // v3[6,4] v2[6,4] v3[2,0] v2[2,0]
  const V v32o = InterleaveUpper(d, vB, vD);  // v3[7,5] v2[7,5] v3[3,1] v2[3,1]

  v0 = InterleaveLower(d, v10e, v10o);
  v1 = InterleaveUpper(d, v10e, v10o);
  v2 = InterleaveLower(d, v32e, v32o);
  v3 = InterleaveUpper(d, v32e, v32o);
}

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  VFromD<D> vA, vB, vC, vD;
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
  v0 = InterleaveLower(d, vA, vC);
  v1 = InterleaveUpper(d, vA, vC);
  v2 = InterleaveLower(d, vB, vD);
  v3 = InterleaveUpper(d, vB, vD);
}

// Any T x1
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
  v3 = LoadU(d, unaligned + 3);
}

// ------------------------------ StoreInterleaved2

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks2(VFromD<D> A, VFromD<D> B, D d,
                                       TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t kN = MaxLanes(d);
  StoreU(A, d, unaligned + 0 * kN);
  StoreU(B, d, unaligned + 1 * kN);
}

}  // namespace detail

// >= 128 bit vector
template <class D, HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const auto v10L = InterleaveLower(d, v0, v1);  // .. v1[0] v0[0]
  const auto v10U = InterleaveUpper(d, v0, v1);  // .. v1[kN/2] v0[kN/2]
  detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
}

// <= 64 bits
template <class V, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API void StoreInterleaved2(V part0, V part1, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const Twice<decltype(d)> d2;
  const auto v0 = ZeroExtendVector(d2, part0);
  const auto v1 = ZeroExtendVector(d2, part1);
  const auto v10 = InterleaveLower(d2, v0, v1);
  StoreU(v10, d2, unaligned);
}

// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
// TableLookupBytes)

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks3(VFromD<D> A, VFromD<D> B, VFromD<D> C,
                                       D d, TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t kN = MaxLanes(d);
  StoreU(A, d, unaligned + 0 * kN);
  StoreU(B, d, unaligned + 1 * kN);
  StoreU(C, d, unaligned + 2 * kN);
}

}  // namespace detail

// >= 128-bit vector, 8-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  const auto k5 = Set(du, TU{5});
  const auto k6 = Set(du, TU{6});

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
  // to their place, with 0x80 so lanes to be filled from other vectors are 0
  // to enable blending by ORing together.
  const VFromD<decltype(du)> shuf_A0 =
      Dup128VecFromValues(du, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3,
                          0x80, 0x80, 4, 0x80, 0x80, 5);
  // Cannot reuse shuf_A0 because it contains 5.
  const VFromD<decltype(du)> shuf_A1 =
      Dup128VecFromValues(du, 0x80, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
                          3, 0x80, 0x80, 4, 0x80, 0x80);
  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  // cannot reuse shuf_A0 (has 5)
  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
  const VFromD<D> A = BitCast(d, A0 | A1 | A2);

  // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5]
  const auto shuf_B0 = shuf_A2 + k6;  // .A..9..8..7..6..
  const auto shuf_B1 = shuf_A0 + k5;  // A..9..8..7..6..5
  const auto shuf_B2 = shuf_A1 + k5;  // ..9..8..7..6..5.
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<D> B = BitCast(d, B0 | B1 | B2);

  // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10]
  const auto shuf_C0 = shuf_B2 + k6;  // ..F..E..D..C..B.
  const auto shuf_C1 = shuf_B0 + k5;  // .F..E..D..C..B..
  const auto shuf_C2 = shuf_B1 + k5;  // F..E..D..C..B..A
  const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
  const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
  const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
  const VFromD<D> C = BitCast(d, C0 | C1 | C2);

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}

// >= 128-bit vector, 16-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const Repartition<uint8_t, decltype(d)> du8;
  const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
  const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
  // filled from other vectors are 0 for blending. Note that these are byte
  // indices for 16-bit lanes.
  const VFromD<decltype(du8)> shuf_A1 =
      Dup128VecFromValues(du8, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3,
                          0x80, 0x80, 0x80, 0x80, 4, 5);
  const VFromD<decltype(du8)> shuf_A2 =
      Dup128VecFromValues(du8, 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80,
                          0x80, 2, 3, 0x80, 0x80, 0x80, 0x80);

  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);

  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
  const VFromD<D> A = BitCast(d, A0 | A1 | A2);

  // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
  const auto shuf_B0 = shuf_A1 + k3;  // 5..4..3.
  const auto shuf_B1 = shuf_A2 + k3;  // ..4..3..
  const auto shuf_B2 = shuf_A0 + k2;  // .4..3..2
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<D> B = BitCast(d, B0 | B1 | B2);

  // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
  const auto shuf_C0 = shuf_B1 + k3;  // ..7..6..
  const auto shuf_C1 = shuf_B2 + k3;  // .7..6..5
  const auto shuf_C2 = shuf_B0 + k2;  // 7..6..5.
  const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
  const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
  const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
  const VFromD<D> C = BitCast(d, C0 | C1 | C2);

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}

// >= 128-bit vector, 32-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d)> dw;

  const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1);
  const VFromD<D> v01_v20 = OddEven(v0, v2);
  // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0)
  const VFromD<D> A = BitCast(
      d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20)));

  const VFromD<D> v1_321 = ShiftRightLanes<1>(d, v1);
  const VFromD<D> v0_32 = ShiftRightLanes<2>(d, v0);
  const VFromD<D> v21_v11 = OddEven(v2, v1_321);
  const VFromD<D> v12_v02 = OddEven(v1_321, v0_32);
  // B: v1[2],v0[2], v2[1],v1[1]
  const VFromD<D> B = BitCast(
      d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02)));

  // Notation refers to the upper 2 lanes of the vector for InterleaveUpper.
  const VFromD<D> v23_v13 = OddEven(v2, v1_321);
  const VFromD<D> v03_v22 = OddEven(v0, v2);
  // C: v2[3],v1[3],v0[3], v2[2]
  const VFromD<D> C = BitCast(
      d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13)));

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}

// >= 128-bit vector, 64-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const VFromD<D> A = InterleaveLower(d, v0, v1);
  const VFromD<D> B = OddEven(v0, v2);
  const VFromD<D> C = InterleaveUpper(d, v1, v2);
  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}

// 64-bit vector, 8-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and first result.
  constexpr size_t kFullN = 16 / sizeof(TFromD<D>);
  const Full128<uint8_t> du;
  const Full128<TFromD<D>> d_full;
  const auto k5 = Set(du, uint8_t{5});
  const auto k6 = Set(du, uint8_t{6});

  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
  // filled from other vectors are 0 for blending.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,  //
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A0 = Load(du, tbl_v0);
  const auto shuf_A1 = Load(du, tbl_v1);  // cannot reuse shuf_A0 (5 in MSB)
  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
  const auto A = BitCast(d_full, A0 | A1 | A2);
  StoreU(A, d_full, unaligned + 0 * kFullN);

  // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
  const auto shuf_B0 = shuf_A2 + k6;  // ..7..6..
  const auto shuf_B1 = shuf_A0 + k5;  // .7..6..5
  const auto shuf_B2 = shuf_A1 + k5;  // 7..6..5.
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<D> B{BitCast(d_full, B0 | B1 | B2).raw};
  StoreU(B, d, unaligned + 1 * kFullN);
}

// 64-bit vector, 16-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 4)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D dh,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const Twice<D> d_full;
  const Full128<uint8_t> du8;
  const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
  const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});

  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};

  // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes
  // to their place, with 0x80 so lanes to be filled from other vectors are 0
  // to enable blending by ORing together.
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0x80, 0,    1,    0x80, 0x80, 0x80, 0x80,
      2,    3,    0x80, 0x80, 0x80, 0x80, 4,    5};
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0,    1,    0x80, 0x80,
      0x80, 0x80, 2,    3,    0x80, 0x80, 0x80, 0x80};

  // The interleaved vectors will be named A, B; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A1 = Load(du8, tbl_v1);  // 2..1..0.
                                           // .2..1..0
  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
  const auto shuf_A2 = Load(du8, tbl_v2);  // ..1..0..

  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
  const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
  StoreU(A, d_full, unaligned);

  // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
  const auto shuf_B0 = shuf_A1 + k3;  // ..3.
  const auto shuf_B1 = shuf_A2 + k3;  // .3..
  const auto shuf_B2 = shuf_A0 + k2;  // 3..2
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<decltype(d_full)> B = BitCast(d_full, B0 | B1 | B2);
  StoreU(VFromD<D>{B.raw}, dh, unaligned + MaxLanes(d_full));
}

// 64-bit vector, 32-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_D(D, 2)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // (same code as 128-bit vector, 64-bit lanes)
  const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1);
  const VFromD<D> v01_v20 = OddEven(v0, v2);
  const VFromD<D> v21_v11 = InterleaveUpper(d, v1, v2);
  constexpr size_t kN = MaxLanes(d);
  StoreU(v10_v00, d, unaligned + 0 * kN);
  StoreU(v01_v20, d, unaligned + 1 * kN);
  StoreU(v21_v11, d, unaligned + 2 * kN);
}

// 64-bit lanes are handled by the N=1 case below.

// <= 32-bit vector, 8-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 4),
          HWY_IF_LANES_GT_D(D, 1)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and result.
  const Full128<uint8_t> du;
  const Full128<TFromD<D>> d_full;

  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};

  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
  // so lanes to be filled from other vectors are 0 to enable blending by ORing
  // together.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0,    0x80, 0x80, 1,    0x80, 0x80, 2,    0x80,
      0x80, 3,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
  // The interleaved vector will be named A; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A0 = Load(du, tbl_v0);
  const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
  const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // ......3..2..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // .....3..2..1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // ....3..2..1..0..
  const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
  StoreU(A, d_full, buf);
  CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
}

// 32-bit vector, 16-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 2)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and result.
  const Full128<uint8_t> du8;
  const Full128<TFromD<D>> d_full;

  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};

  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
  // so lanes to be filled from other vectors are 0 to enable blending by ORing
  // together.
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0,    1,    0x80, 0x80,
      0x80, 0x80, 2,    3,    0x80, 0x80, 0x80, 0x80};
  // The interleaved vector will be named A; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A2 =  // ..1..0..
      Load(du8, tbl_v2);
  const auto shuf_A1 =  // ...1..0.
      CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);
  const auto shuf_A0 =  // ....1..0
      CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // ..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // .1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // 1..0..
  const auto A = BitCast(d_full, A0 | A1 | A2);
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
  StoreU(A, d_full, buf);
  CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
}

// Single-element vector, any lane size: just store directly
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
  StoreU(v2, d, unaligned + 2);
}

// ------------------------------ StoreInterleaved4

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks4(VFromD<D> vA, VFromD<D> vB, VFromD<D> vC,
                                       VFromD<D> vD, D d,
                                       TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t kN = MaxLanes(d);
  StoreU(vA, d, unaligned + 0 * kN);
  StoreU(vB, d, unaligned + 1 * kN);
  StoreU(vC, d, unaligned + 2 * kN);
  StoreU(vD, d, unaligned + 3 * kN);
}

}  // namespace detail

// >= 128-bit vector, 8..32-bit lanes
template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                               VFromD<D> v3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d)> dw;
  const auto v10L = ZipLower(dw, v0, v1);  // .. v1[0] v0[0]
  const auto v32L = ZipLower(dw, v2, v3);
  const auto v10U = ZipUpper(dw, v0, v1);
  const auto v32U = ZipUpper(dw, v2, v3);
  // The interleaved vectors are vA, vB, vC, vD.
  const VFromD<D> vA = BitCast(d, InterleaveLower(dw, v10L, v32L));  // 3210
  const VFromD<D> vB = BitCast(d, InterleaveUpper(dw, v10L, v32L));
  const VFromD<D> vC = BitCast(d, InterleaveLower(dw, v10U, v32U));
  const VFromD<D> vD = BitCast(d, InterleaveUpper(dw, v10U, v32U));
  detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned);
}

// >= 128-bit vector, 64-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                               VFromD<D> v3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // The interleaved vectors are vA, vB, vC, vD.
  const VFromD<D> vA = InterleaveLower(d, v0, v1);  // v1[0] v0[0]
  const VFromD<D> vB = InterleaveLower(d, v2, v3);
  const VFromD<D> vC = InterleaveUpper(d, v0, v1);
  const VFromD<D> vD = InterleaveUpper(d, v2, v3);
  detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned);
}

// 64-bit vector, 8..32-bit lanes
template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, VFromD<D> part3, D /* tag */,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<TFromD<D>> d_full;
  const RepartitionToWide<decltype(d_full)> dw;
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  const VFromD<decltype(d_full)> v3{part3.raw};
  const auto v10 = ZipLower(dw, v0, v1);  // v1[0] v0[0]
  const auto v32 = ZipLower(dw, v2, v3);
  const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32));
  const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32));
  StoreU(A, d_full, unaligned);
  StoreU(B, d_full, unaligned + MaxLanes(d_full));
}

// 64-bit vector, 64-bit lane
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, VFromD<D> part3, D /* tag */,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<TFromD<D>> d_full;
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  const VFromD<decltype(d_full)> v3{part3.raw};
  const auto A = InterleaveLower(d_full, v0, v1);  // v1[0] v0[0]
  const auto B = InterleaveLower(d_full, v2, v3);
  StoreU(A, d_full, unaligned);
  StoreU(B, d_full, unaligned + MaxLanes(d_full));
}

// <= 32-bit vectors
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, VFromD<D> part3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<TFromD<D>> d_full;
  const RepartitionToWide<decltype(d_full)> dw;
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  const VFromD<decltype(d_full)> v3{part3.raw};
  const auto v10 = ZipLower(dw, v0, v1);  // .. v1[0] v0[0]
  const auto v32 = ZipLower(dw, v2, v3);
  const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32));
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
  StoreU(v3210, d_full, buf);
  CopyBytes<d.MaxBytes() * 4>(buf, unaligned);
}

#endif  // HWY_NATIVE_LOAD_STORE_INTERLEAVED

// ------------------------------ LoadN

#if (defined(HWY_NATIVE_LOAD_N) == defined(HWY_TARGET_TOGGLE))

#ifdef HWY_NATIVE_LOAD_N
#undef HWY_NATIVE_LOAD_N
#else
#define HWY_NATIVE_LOAD_N
#endif

#if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
namespace detail {

template <class DTo, class DFrom>
HWY_INLINE VFromD<DTo> LoadNResizeBitCast(DTo d_to, DFrom d_from,
                                          VFromD<DFrom> v) {
#if HWY_TARGET <= HWY_SSE2
  // On SSE2/SSSE3/SSE4, the LoadU operation will zero out any lanes of v.raw
  // past the first (lowest-index) Lanes(d_from) lanes of v.raw if
  // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true
  (void)d_from;
  return ResizeBitCast(d_to, v);
#else
  // On other targets such as PPC/NEON, the contents of any lanes past the first
  // (lowest-index) Lanes(d_from) lanes of v.raw might be non-zero if
  // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true.
  return ZeroExtendResizeBitCast(d_to, d_from, v);
#endif
}

}  // namespace detail

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  return (num_lanes > 0) ? LoadU(d, p) : Zero(d);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  return (num_lanes > 0) ? LoadU(d, p) : no;
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  const FixedTag<TFromD<D>, 1> d1;

  if (num_lanes >= 2) return LoadU(d, p);
  if (num_lanes == 0) return Zero(d);
  return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  const FixedTag<TFromD<D>, 1> d1;

  if (num_lanes >= 2) return LoadU(d, p);
  if (num_lanes == 0) return no;
  return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  const FixedTag<TFromD<D>, 2> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 4) return LoadU(d, p);
  if (num_lanes == 0) return Zero(d);
  if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));

  // Two or three lanes.
  const VFromD<D> v_lo = detail::LoadNResizeBitCast(d, d2, LoadU(d2, p));
  return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  const FixedTag<TFromD<D>, 2> d2;

  if (num_lanes >= 4) return LoadU(d, p);
  if (num_lanes == 0) return no;
  if (num_lanes == 1) return InsertLane(no, 0, p[0]);

  // Two or three lanes.
  const VFromD<D> v_lo =
      ConcatUpperLower(d, no, ResizeBitCast(d, LoadU(d2, p)));
  return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  const FixedTag<TFromD<D>, 4> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 8) return LoadU(d, p);
  if (num_lanes == 0) return Zero(d);
  if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));

  const size_t leading_len = num_lanes & 4;
  VFromD<decltype(d4)> v_trailing = Zero(d4);

  if ((num_lanes & 2) != 0) {
    const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
    if ((num_lanes & 1) != 0) {
      v_trailing = Combine(
          d4,
          detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)),
          v_trailing_lo2);
    } else {
      v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2);
    }
  } else if ((num_lanes & 1) != 0) {
    v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len));
  }

  if (leading_len != 0) {
    return Combine(d, v_trailing, LoadU(d4, p));
  } else {
    return detail::LoadNResizeBitCast(d, d4, v_trailing);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  const FixedTag<TFromD<D>, 4> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 8) return LoadU(d, p);
  if (num_lanes == 0) return no;
  if (num_lanes == 1) return InsertLane(no, 0, p[0]);

  const size_t leading_len = num_lanes & 4;
  VFromD<decltype(d4)> v_trailing = ResizeBitCast(d4, no);

  if ((num_lanes & 2) != 0) {
    const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
    if ((num_lanes & 1) != 0) {
      v_trailing = Combine(
          d4,
          InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)),
                          ResizeBitCast(d2, no)),
          v_trailing_lo2);
    } else {
      v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no),
                                    ResizeBitCast(d4, v_trailing_lo2));
    }
  } else if ((num_lanes & 1) != 0) {
    v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]);
  }

  if (leading_len != 0) {
    return Combine(d, v_trailing, LoadU(d4, p));
  } else {
    return ConcatUpperLower(d, no, ResizeBitCast(d, v_trailing));
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  const FixedTag<TFromD<D>, 8> d8;
  const Half<decltype(d8)> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 16) return LoadU(d, p);
  if (num_lanes == 0) return Zero(d);
  if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));

  const size_t leading_len = num_lanes & 12;
  VFromD<decltype(d4)> v_trailing = Zero(d4);

  if ((num_lanes & 2) != 0) {
    const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
    if ((num_lanes & 1) != 0) {
      v_trailing = Combine(
          d4,
          detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)),
          v_trailing_lo2);
    } else {
      v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2);
    }
  } else if ((num_lanes & 1) != 0) {
    v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len));
  }

  if (leading_len != 0) {
    if (leading_len >= 8) {
      const VFromD<decltype(d8)> v_hi7 =
          ((leading_len & 4) != 0)
              ? Combine(d8, v_trailing, LoadU(d4, p + 8))
              : detail::LoadNResizeBitCast(d8, d4, v_trailing);
      return Combine(d, v_hi7, LoadU(d8, p));
    } else {
      return detail::LoadNResizeBitCast(d, d8,
                                        Combine(d8, v_trailing, LoadU(d4, p)));
    }
  } else {
    return detail::LoadNResizeBitCast(d, d4, v_trailing);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  const FixedTag<TFromD<D>, 8> d8;
  const Half<decltype(d8)> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 16) return LoadU(d, p);
  if (num_lanes == 0) return no;
  if (num_lanes == 1) return InsertLane(no, 0, p[0]);

  const size_t leading_len = num_lanes & 12;
  VFromD<decltype(d4)> v_trailing = ResizeBitCast(d4, no);

  if ((num_lanes & 2) != 0) {
    const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
    if ((num_lanes & 1) != 0) {
      v_trailing = Combine(
          d4,
          InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)),
                          ResizeBitCast(d2, no)),
          v_trailing_lo2);
    } else {
      v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no),
                                    ResizeBitCast(d4, v_trailing_lo2));
    }
  } else if ((num_lanes & 1) != 0) {
    v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]);
  }

  if (leading_len != 0) {
    if (leading_len >= 8) {
      const VFromD<decltype(d8)> v_hi7 =
          ((leading_len & 4) != 0)
              ? Combine(d8, v_trailing, LoadU(d4, p + 8))
              : ConcatUpperLower(d8, ResizeBitCast(d8, no),
                                 ResizeBitCast(d8, v_trailing));
      return Combine(d, v_hi7, LoadU(d8, p));
    } else {
      return ConcatUpperLower(
          d, ResizeBitCast(d, no),
          ResizeBitCast(d, Combine(d8, v_trailing, LoadU(d4, p))));
    }
  } else {
    const Repartition<uint32_t, D> du32;
    // lowest 4 bytes from v_trailing, next 4 from no.
    const VFromD<decltype(du32)> lo8 =
        InterleaveLower(ResizeBitCast(du32, v_trailing), BitCast(du32, no));
    return ConcatUpperLower(d, ResizeBitCast(d, no), ResizeBitCast(d, lo8));
  }
}

#if HWY_MAX_BYTES >= 32

template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  if (num_lanes >= Lanes(d)) return LoadU(d, p);

  const Half<decltype(d)> dh;
  const size_t half_N = Lanes(dh);
  if (num_lanes <= half_N) {
    return ZeroExtendVector(d, LoadN(dh, p, num_lanes));
  } else {
    const VFromD<decltype(dh)> v_lo = LoadU(dh, p);
    const VFromD<decltype(dh)> v_hi = LoadN(dh, p + half_N, num_lanes - half_N);
    return Combine(d, v_hi, v_lo);
  }
}

template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  if (num_lanes >= Lanes(d)) return LoadU(d, p);

  const Half<decltype(d)> dh;
  const size_t half_N = Lanes(dh);
  const VFromD<decltype(dh)> no_h = LowerHalf(no);
  if (num_lanes <= half_N) {
    return ConcatUpperLower(d, no,
                            ResizeBitCast(d, LoadNOr(no_h, dh, p, num_lanes)));
  } else {
    const VFromD<decltype(dh)> v_lo = LoadU(dh, p);
    const VFromD<decltype(dh)> v_hi =
        LoadNOr(no_h, dh, p + half_N, num_lanes - half_N);
    return Combine(d, v_hi, v_lo);
  }
}

#endif  // HWY_MAX_BYTES >= 32

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  const RebindToUnsigned<D> du;
  return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
}

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  const RebindToUnsigned<D> du;
  return BitCast(
      d, LoadNOr(BitCast(du, no), du, detail::U16LanePointer(p), num_lanes));
}

#else  // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE

// For SVE and non-sanitizer AVX-512; RVV has its own specialization.
template <class D>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
#if HWY_MEM_OPS_MIGHT_FAULT
  if (num_lanes <= 0) return Zero(d);
#endif

  return MaskedLoad(FirstN(d, num_lanes), d, p);
}

template <class D>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
#if HWY_MEM_OPS_MIGHT_FAULT
  if (num_lanes <= 0) return no;
#endif

  return MaskedLoadOr(no, FirstN(d, num_lanes), d, p);
}

#endif  // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
#endif  // HWY_NATIVE_LOAD_N

// ------------------------------ StoreN
#if (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif

#if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
namespace detail {

template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) {
  constexpr size_t kMinShrVectBytes =
      (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) ? 8 : 16;
  const FixedTag<uint8_t, kMinShrVectBytes> d_shift;
  return ResizeBitCast(
      dh, ShiftRightBytes<dh.MaxBytes()>(d_shift, ResizeBitCast(d_shift, v)));
}

template <class DH, HWY_IF_V_SIZE_GT_D(DH, 4)>
HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) {
  return UpperHalf(dh, v);
}

}  // namespace detail

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  if (max_lanes_to_store > 0) {
    StoreU(v, d, p);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  if (max_lanes_to_store > 1) {
    StoreU(v, d, p);
  } else if (max_lanes_to_store == 1) {
    const FixedTag<TFromD<D>, 1> d1;
    StoreU(LowerHalf(d1, v), d1, p);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const FixedTag<TFromD<D>, 2> d2;
  const Half<decltype(d2)> d1;

  if (max_lanes_to_store > 1) {
    if (max_lanes_to_store >= 4) {
      StoreU(v, d, p);
    } else {
      StoreU(ResizeBitCast(d2, v), d2, p);
      if (max_lanes_to_store == 3) {
        StoreU(ResizeBitCast(d1, detail::StoreNGetUpperHalf(d2, v)), d1, p + 2);
      }
    }
  } else if (max_lanes_to_store == 1) {
    StoreU(ResizeBitCast(d1, v), d1, p);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const FixedTag<TFromD<D>, 4> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (max_lanes_to_store <= 1) {
    if (max_lanes_to_store == 1) {
      StoreU(ResizeBitCast(d1, v), d1, p);
    }
  } else if (max_lanes_to_store >= 8) {
    StoreU(v, d, p);
  } else if (max_lanes_to_store >= 4) {
    StoreU(LowerHalf(d4, v), d4, p);
    StoreN(detail::StoreNGetUpperHalf(d4, v), d4, p + 4,
           max_lanes_to_store - 4);
  } else {
    StoreN(LowerHalf(d4, v), d4, p, max_lanes_to_store);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const FixedTag<TFromD<D>, 8> d8;
  const Half<decltype(d8)> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (max_lanes_to_store <= 1) {
    if (max_lanes_to_store == 1) {
      StoreU(ResizeBitCast(d1, v), d1, p);
    }
  } else if (max_lanes_to_store >= 16) {
    StoreU(v, d, p);
  } else if (max_lanes_to_store >= 8) {
    StoreU(LowerHalf(d8, v), d8, p);
    StoreN(detail::StoreNGetUpperHalf(d8, v), d8, p + 8,
           max_lanes_to_store - 8);
  } else {
    StoreN(LowerHalf(d8, v), d8, p, max_lanes_to_store);
  }
}

#if HWY_MAX_BYTES >= 32
template <class D, HWY_IF_V_SIZE_GT_D(D, 16), typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const size_t N = Lanes(d);
  if (max_lanes_to_store >= N) {
    StoreU(v, d, p);
    return;
  }

  const Half<decltype(d)> dh;
  const size_t half_N = Lanes(dh);
  if (max_lanes_to_store <= half_N) {
    StoreN(LowerHalf(dh, v), dh, p, max_lanes_to_store);
  } else {
    StoreU(LowerHalf(dh, v), dh, p);
    StoreN(UpperHalf(dh, v), dh, p + half_N, max_lanes_to_store - half_N);
  }
}
#endif  // HWY_MAX_BYTES >= 32

#else  // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE
template <class D, typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const size_t N = Lanes(d);
  const size_t clamped_max_lanes_to_store = HWY_MIN(max_lanes_to_store, N);
#if HWY_MEM_OPS_MIGHT_FAULT
  if (clamped_max_lanes_to_store == 0) return;
#endif

  BlendedStore(v, FirstN(d, clamped_max_lanes_to_store), d, p);

  detail::MaybeUnpoison(p, clamped_max_lanes_to_store);
}
#endif  // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE

#endif  // (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))

// ------------------------------ Scatter

#if (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SCATTER
#undef HWY_NATIVE_SCATTER
#else
#define HWY_NATIVE_SCATTER
#endif

template <class D, typename T = TFromD<D>>
HWY_API void ScatterOffset(VFromD<D> v, D d, T* HWY_RESTRICT base,
                           VFromD<RebindToSigned<D>> offset) {
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  HWY_ALIGN TI offset_lanes[MaxLanes(d)];
  Store(offset, di, offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <class D, typename T = TFromD<D>>
HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base,
                          VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  HWY_ALIGN TI index_lanes[MaxLanes(d)];
  Store(index, di, index_lanes);

  for (size_t i = 0; i < MaxLanes(d); ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}

template <class D, typename T = TFromD<D>>
HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d,
                                T* HWY_RESTRICT base,
                                VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  HWY_ALIGN TI index_lanes[MaxLanes(d)];
  Store(index, di, index_lanes);

  HWY_ALIGN TI mask_lanes[MaxLanes(di)];
  Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);

  for (size_t i = 0; i < MaxLanes(d); ++i) {
    if (mask_lanes[i]) base[index_lanes[i]] = lanes[i];
  }
}

#endif  // (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))

// ------------------------------ Gather

#if (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_GATHER
#undef HWY_NATIVE_GATHER
#else
#define HWY_NATIVE_GATHER
#endif

template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base,
                               VFromD<RebindToSigned<D>> offset) {
  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN TI offset_lanes[MaxLanes(d)];
  Store(offset, di, offset_lanes);

  HWY_ALIGN T lanes[MaxLanes(d)];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    HWY_DASSERT(offset_lanes[i] >= 0);
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);
}

template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
                              VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN TI index_lanes[MaxLanes(d)];
  Store(index, di, index_lanes);

  HWY_ALIGN T lanes[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    HWY_DASSERT(index_lanes[i] >= 0);
    lanes[i] = base[index_lanes[i]];
  }
  return Load(d, lanes);
}

template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
                                    const T* HWY_RESTRICT base,
                                    VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN TI index_lanes[MaxLanes(di)];
  Store(index, di, index_lanes);

  HWY_ALIGN TI mask_lanes[MaxLanes(di)];
  Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);

  HWY_ALIGN T lanes[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    HWY_DASSERT(index_lanes[i] >= 0);
    lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : T{0};
  }
  return Load(d, lanes);
}

template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
                                      const T* HWY_RESTRICT base,
                                      VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN TI index_lanes[MaxLanes(di)];
  Store(index, di, index_lanes);

  HWY_ALIGN TI mask_lanes[MaxLanes(di)];
  Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);

  HWY_ALIGN T no_lanes[MaxLanes(d)];
  Store(no, d, no_lanes);

  HWY_ALIGN T lanes[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    HWY_DASSERT(index_lanes[i] >= 0);
    lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : no_lanes[i];
  }
  return Load(d, lanes);
}

#endif  // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))

// ------------------------------ ScatterN/GatherN

template <class D, typename T = TFromD<D>>
HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
                           VFromD<RebindToSigned<D>> index,
                           const size_t max_lanes_to_store) {
  MaskedScatterIndex(v, FirstN(d, max_lanes_to_store), d, base, index);
}

template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
                               VFromD<RebindToSigned<D>> index,
                               const size_t max_lanes_to_load) {
  return MaskedGatherIndex(FirstN(d, max_lanes_to_load), d, base, index);
}

// ------------------------------ Integer AbsDiff and SumsOf8AbsDiff

#if (defined(HWY_NATIVE_INTEGER_ABS_DIFF) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
#undef HWY_NATIVE_INTEGER_ABS_DIFF
#else
#define HWY_NATIVE_INTEGER_ABS_DIFF
#endif

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V AbsDiff(V a, V b) {
  return Sub(Max(a, b), Min(a, b));
}

#endif  // HWY_NATIVE_INTEGER_ABS_DIFF

#if (defined(HWY_NATIVE_SUMS_OF_8_ABS_DIFF) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
#endif

template <class V, HWY_IF_UI8_D(DFromV<V>),
          HWY_IF_V_SIZE_GT_D(DFromV<V>, (HWY_TARGET == HWY_SCALAR ? 0 : 4))>
HWY_API Vec<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RepartitionToWideX3<decltype(d)> dw;

  return BitCast(dw, SumsOf8(BitCast(du, AbsDiff(a, b))));
}

#endif  // HWY_NATIVE_SUMS_OF_8_ABS_DIFF

// ------------------------------ SaturatedAdd/SaturatedSub for UI32/UI64

#if (defined(HWY_NATIVE_I32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif

template <class V, HWY_IF_I32_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
  const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int32_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}

template <class V, HWY_IF_I32_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int32_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}

#endif  // HWY_NATIVE_I32_SATURATED_ADDSUB

#if (defined(HWY_NATIVE_I64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif

template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
  const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}

template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}

#endif  // HWY_NATIVE_I64_SATURATED_ADDSUB

#if (defined(HWY_NATIVE_U32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif

template <class V, HWY_IF_U32_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  return Add(a, Min(b, Not(a)));
}

template <class V, HWY_IF_U32_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  return Sub(a, Min(a, b));
}

#endif  // HWY_NATIVE_U32_SATURATED_ADDSUB

#if (defined(HWY_NATIVE_U64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
#undef HWY_NATIVE_U64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U64_SATURATED_ADDSUB
#endif

template <class V, HWY_IF_U64_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  return Add(a, Min(b, Not(a)));
}

template <class V, HWY_IF_U64_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  return Sub(a, Min(a, b));
}

#endif  // HWY_NATIVE_U64_SATURATED_ADDSUB

// ------------------------------ Unsigned to signed demotions

template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
          class V2 = VFromD<Rebind<TFromV<V>, DN>>,
          hwy::EnableIf<(sizeof(TFromD<DN>) < sizeof(TFromV<V>))>* = nullptr,
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
HWY_API VFromD<DN> DemoteTo(DN dn, V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(dn)> dn_u;

  // First, do a signed to signed demotion. This will convert any values
  // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a
  // negative value.
  const auto i2i_demote_result = DemoteTo(dn, BitCast(di, v));

  // Second, convert any negative values to hwy::HighestValue<TFromD<DN>>()
  // using an unsigned Min operation.
  const auto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>());

  return BitCast(
      dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val)));
}

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(dn)> dn_u;

  // First, do a signed to signed demotion. This will convert any values
  // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a
  // negative value.
  const auto i2i_demote_result =
      ReorderDemote2To(dn, BitCast(di, a), BitCast(di, b));

  // Second, convert any negative values to hwy::HighestValue<TFromD<DN>>()
  // using an unsigned Min operation.
  const auto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>());

  return BitCast(
      dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val)));
}
#endif

// ------------------------------ PromoteLowerTo

// There is no codegen advantage for a native version of this. It is provided
// only for convenience.
template <class D, class V>
HWY_API VFromD<D> PromoteLowerTo(D d, V v) {
  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
  // because it cannot be deduced from D (could be either bf16 or f16).
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteTo(d, LowerHalf(dh, v));
}

// ------------------------------ PromoteUpperTo

#if (defined(HWY_NATIVE_PROMOTE_UPPER_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
#undef HWY_NATIVE_PROMOTE_UPPER_TO
#else
#define HWY_NATIVE_PROMOTE_UPPER_TO
#endif

// This requires UpperHalf.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE

template <class D, class V>
HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
  // because it cannot be deduced from D (could be either bf16 or f16).
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteTo(d, UpperHalf(dh, v));
}

#endif  // HWY_TARGET != HWY_SCALAR
#endif  // HWY_NATIVE_PROMOTE_UPPER_TO

// ------------------------------ PromoteEvenTo/PromoteOddTo

#if HWY_TARGET != HWY_SCALAR
namespace detail {

// Tag dispatch is used in detail::PromoteEvenTo and detail::PromoteOddTo as
// there are target-specific specializations for some of the
// detail::PromoteEvenTo and detail::PromoteOddTo cases on
// SVE/PPC/SSE2/SSSE3/SSE4/AVX2.

// All targets except HWY_SCALAR use the implementations of
// detail::PromoteEvenTo and detail::PromoteOddTo in generic_ops-inl.h for at
// least some of the PromoteEvenTo and PromoteOddTo cases.

// Signed to signed PromoteEvenTo/PromoteOddTo
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, need to shift each lane of the bitcasted vector
  // left by kToLaneSize * 4 bits to get the bits of the even source lanes into
  // the upper kToLaneSize * 4 bits of even_in_hi.
  const auto even_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
#else
  // On big-endian targets, the bits of the even source lanes are already in
  // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
  const auto even_in_hi = BitCast(d_to, v);
#endif

  // Right-shift even_in_hi by kToLaneSize * 4 bits
  return ShiftRight<kToLaneSize * 4>(even_in_hi);
}

template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, the bits of the odd source lanes are already in
  // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
  const auto odd_in_hi = BitCast(d_to, v);
#else
  // On big-endian targets, need to shift each lane of the bitcasted vector left
  // by kToLaneSize * 4 bits to get the bits of the odd source lanes into the
  // upper kToLaneSize * 4 bits of odd_in_hi.
  const auto odd_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
#endif

  // Right-shift odd_in_hi by kToLaneSize * 4 bits
  return ShiftRight<kToLaneSize * 4>(odd_in_hi);
}

// Unsigned to unsigned PromoteEvenTo/PromoteOddTo
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::UnsignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, the bits of the even source lanes are already
  // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.

  // Simply need to zero out the upper bits of each lane of the bitcasted
  // vector.
  return And(BitCast(d_to, v),
             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
#else
  // On big-endian targets, need to shift each lane of the bitcasted vector
  // right by kToLaneSize * 4 bits to get the bits of the even source lanes into
  // the lower kToLaneSize * 4 bits of the result.

  // The right shift below will zero out the upper kToLaneSize * 4 bits of the
  // result.
  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
#endif
}

template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::UnsignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, need to shift each lane of the bitcasted vector
  // right by kToLaneSize * 4 bits to get the bits of the odd source lanes into
  // the lower kToLaneSize * 4 bits of the result.

  // The right shift below will zero out the upper kToLaneSize * 4 bits of the
  // result.
  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
#else
  // On big-endian targets, the bits of the even source lanes are already
  // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.

  // Simply need to zero out the upper bits of each lane of the bitcasted
  // vector.
  return And(BitCast(d_to, v),
             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
#endif
}

// Unsigned to signed: Same as unsigned->unsigned PromoteEvenTo/PromoteOddTo
// followed by BitCast to signed
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
  return BitCast(d_to,
                 PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
                               hwy::UnsignedTag(), du_to, v));
}

template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
  return BitCast(d_to,
                 PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
                              hwy::UnsignedTag(), du_to, v));
}

// BF16->F32 PromoteEvenTo

// NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
// instead of hwy::FloatTag on targets that use scalable vectors.

// VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
// type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>

// The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
// to be a bfloat16_t vector.
template <class FromTypeTag, class DF32, class VBF16,
          class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
          hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
HWY_INLINE VFromD<DF32> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
                                      hwy::SizeTag<4> /*to_lane_size_tag*/,
                                      FromTypeTag /*from_type_tag*/, DF32 d_to,
                                      VBF16 v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian platforms, need to shift left each lane of the bitcasted
  // vector by 16 bits.
  return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
#else
  // On big-endian platforms, the even lanes of the source vector are already
  // in the upper 16 bits of the lanes of the bitcasted vector.

  // Need to simply zero out the lower 16 bits of each lane of the bitcasted
  // vector.
  return BitCast(d_to,
                 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
#endif
}

// BF16->F32 PromoteOddTo

// NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
// instead of hwy::FloatTag on targets that use scalable vectors.

// VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
// type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>

// The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
// to be a bfloat16_t vector.
template <class FromTypeTag, class DF32, class VBF16,
          class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
          hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
HWY_INLINE VFromD<DF32> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
                                     hwy::SizeTag<4> /*to_lane_size_tag*/,
                                     FromTypeTag /*from_type_tag*/, DF32 d_to,
                                     VBF16 v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian platforms, the odd lanes of the source vector are already
  // in the upper 16 bits of the lanes of the bitcasted vector.

  // Need to simply zero out the lower 16 bits of each lane of the bitcasted
  // vector.
  return BitCast(d_to,
                 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
#else
  // On big-endian platforms, need to shift left each lane of the bitcasted
  // vector by 16 bits.
  return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
#endif
}

// Default PromoteEvenTo/PromoteOddTo implementations
template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V, HWY_IF_LANES_D(D, 1)>
HWY_INLINE VFromD<D> PromoteEvenTo(
    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  return PromoteLowerTo(d_to, v);
}

template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V, HWY_IF_LANES_GT_D(D, 1)>
HWY_INLINE VFromD<D> PromoteEvenTo(
    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  const DFromV<decltype(v)> d;
  return PromoteLowerTo(d_to, ConcatEven(d, v, v));
}

template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  const DFromV<decltype(v)> d;
  return PromoteLowerTo(d_to, ConcatOdd(d, v, v));
}

}  // namespace detail

template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
          class V2 = VFromD<Repartition<TFromV<V>, D>>,
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
HWY_API VFromD<D> PromoteEvenTo(D d, V v) {
  return detail::PromoteEvenTo(hwy::TypeTag<TFromD<D>>(),
                               hwy::SizeTag<sizeof(TFromD<D>)>(),
                               hwy::TypeTag<TFromV<V>>(), d, v);
}

template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
          class V2 = VFromD<Repartition<TFromV<V>, D>>,
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
HWY_API VFromD<D> PromoteOddTo(D d, V v) {
  return detail::PromoteOddTo(hwy::TypeTag<TFromD<D>>(),
                              hwy::SizeTag<sizeof(TFromD<D>)>(),
                              hwy::TypeTag<TFromV<V>>(), d, v);
}
#endif  // HWY_TARGET != HWY_SCALAR

// ------------------------------ float16_t <-> float

#if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif

template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) {
  const RebindToSigned<decltype(df32)> di32;
  const RebindToUnsigned<decltype(df32)> du32;
  const Rebind<uint16_t, decltype(df32)> du16;
  using VU32 = VFromD<decltype(du32)>;

  const VU32 bits16 = PromoteTo(du32, BitCast(du16, v));
  const VU32 sign = ShiftRight<15>(bits16);
  const VU32 biased_exp = And(ShiftRight<10>(bits16), Set(du32, 0x1F));
  const VU32 mantissa = And(bits16, Set(du32, 0x3FF));
  const VU32 subnormal =
      BitCast(du32, Mul(ConvertTo(df32, BitCast(di32, mantissa)),
                        Set(df32, 1.0f / 16384 / 1024)));

  const VU32 biased_exp32 = Add(biased_exp, Set(du32, 127 - 15));
  const VU32 mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const VU32 normal = Or(ShiftLeft<23>(biased_exp32), mantissa32);
  const VU32 bits32 = IfThenElse(Eq(biased_exp, Zero(du32)), subnormal, normal);
  return BitCast(df32, Or(ShiftLeft<31>(sign), bits32));
}

template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
  const RebindToSigned<decltype(df16)> di16;
  const Rebind<int32_t, decltype(df16)> di32;
  const RebindToFloat<decltype(di32)> df32;
  const RebindToUnsigned<decltype(df32)> du32;

  // There are 23 fractional bits (plus the implied 1 bit) in the mantissa of
  // a F32, and there are 10 fractional bits (plus the implied 1 bit) in the
  // mantissa of a F16

  // We want the unbiased exponent of round_incr[i] to be at least (-14) + 13 as
  // 2^(-14) is the smallest positive normal F16 value and as we want 13
  // mantissa bits (including the implicit 1 bit) to the left of the
  // F32 mantissa bits in rounded_val[i] since 23 - 10 is equal to 13

  // The biased exponent of round_incr[i] needs to be at least 126 as
  // (-14) + 13 + 127 is equal to 126

  // We also want to biased exponent of round_incr[i] to be less than or equal
  // to 255 (which is equal to MaxExponentField<float>())

  // The biased F64 exponent of round_incr is equal to
  // HWY_MAX(HWY_MIN(((exp_bits[i] >> 23) & 255) + 13, 255), 126)

  // hi9_bits[i] is equal to the upper 9 bits of v[i]
  const auto hi9_bits = ShiftRight<23>(BitCast(du32, v));

  const auto k13 = Set(du32, uint32_t{13u});

  // Minimum biased F32 exponent of round_incr
  const auto k126 = Set(du32, uint32_t{126u});

  // round_incr_hi9_bits[i] is equivalent to
  // (hi9_bits[i] & 0x100) |
  // HWY_MAX(HWY_MIN((hi9_bits[i] & 0xFF) + 13, 255), 126)

#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
  const auto k255 = Set(du32, uint32_t{255u});
  const auto round_incr_hi9_bits = BitwiseIfThenElse(
      k255, Max(Min(Add(And(hi9_bits, k255), k13), k255), k126), hi9_bits);
#else
  // On targets other than SCALAR and EMU128, the exponent bits of hi9_bits can
  // be incremented by 13 and clamped to the [13, 255] range without overflowing
  // into the sign bit of hi9_bits by using U8 SaturatedAdd as there are 8
  // exponent bits in an F32

  // U8 Max can be used on targets other than SCALAR and EMU128 to clamp
  // ((hi9_bits & 0xFF) + 13) to the [126, 255] range without affecting the sign
  // bit

  const Repartition<uint8_t, decltype(du32)> du32_as_u8;
  const auto round_incr_hi9_bits = BitCast(
      du32,
      Max(SaturatedAdd(BitCast(du32_as_u8, hi9_bits), BitCast(du32_as_u8, k13)),
          BitCast(du32_as_u8, k126)));
#endif

  // (round_incr_hi9_bits >> 8) is equal to (hi9_bits >> 8), and
  // (round_incr_hi9_bits & 0xFF) is equal to
  // HWY_MAX(HWY_MIN((round_incr_hi9_bits & 0xFF) + 13, 255), 126)

  const auto round_incr = BitCast(df32, ShiftLeft<23>(round_incr_hi9_bits));

  // Add round_incr[i] to v[i] to round the mantissa to the nearest F16 mantissa
  // and to move the fractional bits of the resulting non-NaN mantissa down to
  // the lower 10 bits of rounded_val if (v[i] + round_incr[i]) is a non-NaN
  // value
  const auto rounded_val = Add(v, round_incr);

  // rounded_val_bits is the bits of rounded_val as a U32
  const auto rounded_val_bits = BitCast(du32, rounded_val);

  // rounded_val[i] is known to have the same biased exponent as round_incr[i]
  // as |round_incr[i]| > 2^12*|v[i]| is true if round_incr[i] is a finite
  // value, round_incr[i] and v[i] both have the same sign, and |round_incr[i]|
  // is either a power of 2 that is greater than or equal to 2^-1 or infinity.

  // If rounded_val[i] is a finite F32 value, then
  // (rounded_val_bits[i] & 0x00000FFF) is the bit representation of the
  // rounded mantissa of rounded_val[i] as a UQ2.10 fixed point number that is
  // in the range [0, 2].

  // In other words, (rounded_val_bits[i] & 0x00000FFF) is between 0 and 0x0800,
  // with (rounded_val_bits[i] & 0x000003FF) being the fractional bits of the
  // resulting F16 mantissa, if rounded_v[i] is a finite F32 value.

  // (rounded_val_bits[i] & 0x007FF000) == 0 is guaranteed to be true if
  // rounded_val[i] is a non-NaN value

  // The biased exponent of rounded_val[i] is guaranteed to be at least 126 as
  // the biased exponent of round_incr[i] is at least 126 and as both v[i] and
  // round_incr[i] have the same sign bit

  // The ULP of a F32 value with a biased exponent of 126 is equal to
  // 2^(126 - 127 - 23), which is equal to 2^(-24) (which is also the ULP of a
  // F16 value with a biased exponent of 0 or 1 as (1 - 15 - 10) is equal to
  // -24)

  // The biased exponent (before subtracting by 126) needs to be clamped to the
  // [126, 157] range as 126 + 31 is equal to 157 and as 31 is the largest
  // biased exponent of a F16.

  // The biased exponent of the resulting F16 value is equal to
  // HWY_MIN((round_incr_hi9_bits[i] & 0xFF) +
  //         ((rounded_val_bits[i] >> 10) & 0xFF), 157) - 126

#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
  auto f16_exp_bits =
      Min(Add(ShiftLeft<10>(And(round_incr_hi9_bits, k255)),
              And(rounded_val_bits,
                  Set(du32, static_cast<uint32_t>(uint32_t{0xFFu} << 10)))),
          Set(du32, static_cast<uint32_t>(uint32_t{157u} << 10)));
#else
  auto f16_exp_bits = ShiftLeft<10>(BitCast(
      du32,
      Min(SaturatedAdd(BitCast(du32_as_u8, round_incr_hi9_bits),
                       BitCast(du32_as_u8, ShiftRight<10>(rounded_val_bits))),
          BitCast(du32_as_u8, Set(du32, uint32_t{157})))));
#endif

  f16_exp_bits =
      Sub(f16_exp_bits, Set(du32, static_cast<uint32_t>(uint32_t{126u} << 10)));

  const auto f16_unmasked_mant_bits =
      BitCast(di32, Or(rounded_val, VecFromMask(df32, IsNaN(rounded_val))));

  const auto f16_exp_mant_bits =
      OrAnd(BitCast(di32, f16_exp_bits), f16_unmasked_mant_bits,
            Set(di32, int32_t{0x03FF}));

  // f16_bits_as_i32 is the F16 bits sign-extended to an I32 (with the upper 17
  // bits of f16_bits_as_i32[i] set to the sign bit of rounded_val[i]) to allow
  // efficient truncation of the F16 bits to an I16 using an I32->I16 DemoteTo
  // operation
  const auto f16_bits_as_i32 =
      OrAnd(f16_exp_mant_bits, ShiftRight<16>(BitCast(di32, rounded_val_bits)),
            Set(di32, static_cast<int32_t>(0xFFFF8000u)));
  return BitCast(df16, DemoteTo(di16, f16_bits_as_i32));
}

#endif  // HWY_NATIVE_F16C

// ------------------------------ F64->F16 DemoteTo
#if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
#else
#define HWY_NATIVE_DEMOTE_F64_TO_F16
#endif

#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
  const Rebind<double, D> df64;
  const Rebind<uint64_t, D> du64;
  const Rebind<float, D> df32;

  // The mantissa bits of v[i] are first rounded using round-to-odd rounding to
  // the nearest F64 value that has the lower 29 bits zeroed out to ensure that
  // the result is correctly rounded to a F16.

  const auto vf64_rounded = OrAnd(
      And(v,
          BitCast(df64, Set(du64, static_cast<uint64_t>(0xFFFFFFFFE0000000u)))),
      BitCast(df64, Add(BitCast(du64, v),
                        Set(du64, static_cast<uint64_t>(0x000000001FFFFFFFu)))),
      BitCast(df64, Set(du64, static_cast<uint64_t>(0x0000000020000000ULL))));

  return DemoteTo(df16, DemoteTo(df32, vf64_rounded));
}
#endif  // HWY_HAVE_FLOAT64

#endif  // HWY_NATIVE_DEMOTE_F64_TO_F16

// ------------------------------ F16->F64 PromoteTo
#if (defined(HWY_NATIVE_PROMOTE_F16_TO_F64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
#undef HWY_NATIVE_PROMOTE_F16_TO_F64
#else
#define HWY_NATIVE_PROMOTE_F16_TO_F64
#endif

#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<float16_t, D>> v) {
  return PromoteTo(df64, PromoteTo(Rebind<float, D>(), v));
}
#endif  // HWY_HAVE_FLOAT64

#endif  // HWY_NATIVE_PROMOTE_F16_TO_F64

// ------------------------------ SumsOf2

#if HWY_TARGET != HWY_SCALAR
namespace detail {

template <class TypeTag, size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const RepartitionToWide<decltype(d)> dw;
  return Add(PromoteEvenTo(dw, v), PromoteOddTo(dw, v));
}

}  // namespace detail

template <class V>
HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(V v) {
  return detail::SumsOf2(hwy::TypeTag<TFromV<V>>(),
                         hwy::SizeTag<sizeof(TFromV<V>)>(), v);
}
#endif  // HWY_TARGET != HWY_SCALAR

// ------------------------------ SumsOf4

namespace detail {

template <class TypeTag, size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
    TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
  using hwy::HWY_NAMESPACE::SumsOf2;
  return SumsOf2(SumsOf2(v));
}

}  // namespace detail

template <class V>
HWY_API VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(V v) {
  return detail::SumsOf4(hwy::TypeTag<TFromV<V>>(),
                         hwy::SizeTag<sizeof(TFromV<V>)>(), v);
}

// ------------------------------ OrderedTruncate2To

#if HWY_IDE || \
    (defined(HWY_NATIVE_ORDERED_TRUNCATE_2_TO) == defined(HWY_TARGET_TOGGLE))

#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#else
#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#endif

// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DFromV<VFromD<DN>>, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
  return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a));
}
#endif  // HWY_TARGET != HWY_SCALAR
#endif  // HWY_NATIVE_ORDERED_TRUNCATE_2_TO

// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex

#if (defined(HWY_NATIVE_LEADING_ZERO_COUNT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

namespace detail {

template <class D, HWY_IF_U32_D(D)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const RebindToFloat<decltype(d)> df;
#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2
  const RebindToSigned<decltype(d)> di;
  const Repartition<int16_t, decltype(d)> di16;

  // On SSE2/SSSE3/SSE4/AVX2, do an int32_t to float conversion, followed
  // by a unsigned right shift of the uint32_t bit representation of the
  // floating point values by 23, followed by an int16_t Min
  // operation as we are only interested in the biased exponent that would
  // result from a uint32_t to float conversion.

  // An int32_t to float vector conversion is also much more efficient on
  // SSE2/SSSE3/SSE4/AVX2 than an uint32_t vector to float vector conversion
  // as an uint32_t vector to float vector conversion on SSE2/SSSE3/SSE4/AVX2
  // requires multiple instructions whereas an int32_t to float vector
  // conversion can be carried out using a single instruction on
  // SSE2/SSSE3/SSE4/AVX2.

  const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(di, v)));
  return BitCast(d, Min(BitCast(di16, ShiftRight<23>(f32_bits)),
                        BitCast(di16, Set(d, 158))));
#else
  const auto f32_bits = BitCast(d, ConvertTo(df, v));
  return BitCast(d, ShiftRight<23>(f32_bits));
#endif
}

template <class V, HWY_IF_U32_D(DFromV<V>)>
HWY_INLINE V I32RangeU32ToF32BiasedExp(V v) {
  // I32RangeU32ToF32BiasedExp is similar to UIntToF32BiasedExp, but
  // I32RangeU32ToF32BiasedExp assumes that v[i] is between 0 and 2147483647.
  const DFromV<decltype(v)> d;
  const RebindToFloat<decltype(d)> df;
#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2
  const RebindToSigned<decltype(d)> d_src;
#else
  const RebindToUnsigned<decltype(d)> d_src;
#endif
  const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(d_src, v)));
  return ShiftRight<23>(f32_bits);
}

template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const Rebind<uint32_t, decltype(d)> du32;
  const auto f32_biased_exp_as_u32 =
      I32RangeU32ToF32BiasedExp(PromoteTo(du32, v));
  return TruncateTo(d, f32_biased_exp_as_u32);
}

#if HWY_TARGET != HWY_SCALAR
template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const Half<decltype(d)> dh;
  const Rebind<uint32_t, decltype(dh)> du32;

  const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v));
  const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v));

  const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32);
  const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32);
#if HWY_TARGET <= HWY_SSE2
  const RebindToSigned<decltype(du32)> di32;
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 OrderedDemote2To(di, BitCast(di32, lo_f32_biased_exp_as_u32),
                                  BitCast(di32, hi_f32_biased_exp_as_u32)));
#else
  return OrderedTruncate2To(d, lo_f32_biased_exp_as_u32,
                            hi_f32_biased_exp_as_u32);
#endif
}
#endif  // HWY_TARGET != HWY_SCALAR

template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const Rebind<uint32_t, decltype(d)> du32;
  const auto f32_biased_exp_as_u32 =
      I32RangeU32ToF32BiasedExp(PromoteTo(du32, v));
  return U8FromU32(f32_biased_exp_as_u32);
}

#if HWY_TARGET != HWY_SCALAR
template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4),
          HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 2)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const Half<decltype(d)> dh;
  const Rebind<uint32_t, decltype(dh)> du32;
  const Repartition<uint16_t, decltype(du32)> du16;

  const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v));
  const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v));

  const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32);
  const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32);

#if HWY_TARGET <= HWY_SSE2
  const RebindToSigned<decltype(du32)> di32;
  const RebindToSigned<decltype(du16)> di16;
  const auto f32_biased_exp_as_i16 =
      OrderedDemote2To(di16, BitCast(di32, lo_f32_biased_exp_as_u32),
                       BitCast(di32, hi_f32_biased_exp_as_u32));
  return DemoteTo(d, f32_biased_exp_as_i16);
#else
  const auto f32_biased_exp_as_u16 = OrderedTruncate2To(
      du16, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32);
  return TruncateTo(d, f32_biased_exp_as_u16);
#endif
}

template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 2)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const Half<decltype(d)> dh;
  const Half<decltype(dh)> dq;
  const Rebind<uint32_t, decltype(dq)> du32;
  const Repartition<uint16_t, decltype(du32)> du16;

  const auto lo_half = LowerHalf(dh, v);
  const auto hi_half = UpperHalf(dh, v);

  const auto u32_q0 = PromoteTo(du32, LowerHalf(dq, lo_half));
  const auto u32_q1 = PromoteTo(du32, UpperHalf(dq, lo_half));
  const auto u32_q2 = PromoteTo(du32, LowerHalf(dq, hi_half));
  const auto u32_q3 = PromoteTo(du32, UpperHalf(dq, hi_half));

  const auto f32_biased_exp_as_u32_q0 = I32RangeU32ToF32BiasedExp(u32_q0);
  const auto f32_biased_exp_as_u32_q1 = I32RangeU32ToF32BiasedExp(u32_q1);
  const auto f32_biased_exp_as_u32_q2 = I32RangeU32ToF32BiasedExp(u32_q2);
  const auto f32_biased_exp_as_u32_q3 = I32RangeU32ToF32BiasedExp(u32_q3);

#if HWY_TARGET <= HWY_SSE2
  const RebindToSigned<decltype(du32)> di32;
  const RebindToSigned<decltype(du16)> di16;

  const auto lo_f32_biased_exp_as_i16 =
      OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q0),
                       BitCast(di32, f32_biased_exp_as_u32_q1));
  const auto hi_f32_biased_exp_as_i16 =
      OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q2),
                       BitCast(di32, f32_biased_exp_as_u32_q3));
  return OrderedDemote2To(d, lo_f32_biased_exp_as_i16,
                          hi_f32_biased_exp_as_i16);
#else
  const auto lo_f32_biased_exp_as_u16 = OrderedTruncate2To(
      du16, f32_biased_exp_as_u32_q0, f32_biased_exp_as_u32_q1);
  const auto hi_f32_biased_exp_as_u16 = OrderedTruncate2To(
      du16, f32_biased_exp_as_u32_q2, f32_biased_exp_as_u32_q3);
  return OrderedTruncate2To(d, lo_f32_biased_exp_as_u16,
                            hi_f32_biased_exp_as_u16);
#endif
}
#endif  // HWY_TARGET != HWY_SCALAR

#if HWY_TARGET == HWY_SCALAR
template <class D>
using F32ExpLzcntMinMaxRepartition = RebindToUnsigned<D>;
#elif HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2
template <class D>
using F32ExpLzcntMinMaxRepartition = Repartition<uint8_t, D>;
#else
template <class D>
using F32ExpLzcntMinMaxRepartition =
    Repartition<UnsignedFromSize<HWY_MIN(sizeof(TFromD<D>), 4)>, D>;
#endif

template <class V>
using F32ExpLzcntMinMaxCmpV = VFromD<F32ExpLzcntMinMaxRepartition<DFromV<V>>>;

template <class V>
HWY_INLINE F32ExpLzcntMinMaxCmpV<V> F32ExpLzcntMinMaxBitCast(V v) {
  const DFromV<decltype(v)> d;
  const F32ExpLzcntMinMaxRepartition<decltype(d)> d2;
  return BitCast(d2, v);
}

template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
#if HWY_TARGET == HWY_SCALAR
  const uint64_t u64_val = GetLane(v);
  const float f32_val = static_cast<float>(u64_val);
  const uint32_t f32_bits = BitCastScalar<uint32_t>(f32_val);
  return Set(d, static_cast<uint64_t>(f32_bits >> 23));
#else
  const Repartition<uint32_t, decltype(d)> du32;
  const auto f32_biased_exp = UIntToF32BiasedExp(du32, BitCast(du32, v));
  const auto f32_biased_exp_adj =
      IfThenZeroElse(Eq(f32_biased_exp, Zero(du32)),
                     BitCast(du32, Set(d, 0x0000002000000000u)));
  const auto adj_f32_biased_exp = Add(f32_biased_exp, f32_biased_exp_adj);

  return ShiftRight<32>(BitCast(
      d, Max(F32ExpLzcntMinMaxBitCast(adj_f32_biased_exp),
             F32ExpLzcntMinMaxBitCast(Reverse2(du32, adj_f32_biased_exp)))));
#endif
}

template <class V, HWY_IF_UNSIGNED_V(V)>
HWY_INLINE V UIntToF32BiasedExp(V v) {
  const DFromV<decltype(v)> d;
  return UIntToF32BiasedExp(d, v);
}

template <class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) {
  return v;
}

template <class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) {
  // If v[i] >= 16777216 is true, make sure that the bit at
  // HighestSetBitIndex(v[i]) - 24 is zeroed out to ensure that any inexact
  // conversion to single-precision floating point is rounded down.

  // This zeroing-out can be accomplished through the AndNot operation below.
  return AndNot(ShiftRight<24>(v), v);
}

}  // namespace detail

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V HighestSetBitIndex(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;

  const auto f32_biased_exp = detail::UIntToF32BiasedExp(
      detail::NormalizeForUIntTruncConvToF32(BitCast(du, v)));
  return BitCast(d, Sub(f32_biased_exp, Set(du, TU{127})));
}

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V LeadingZeroCount(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;

  constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
  const auto f32_biased_exp = detail::UIntToF32BiasedExp(
      detail::NormalizeForUIntTruncConvToF32(BitCast(du, v)));
  const auto lz_count = Sub(Set(du, TU{kNumOfBitsInT + 126}), f32_biased_exp);

  return BitCast(d,
                 Min(detail::F32ExpLzcntMinMaxBitCast(lz_count),
                     detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT))));
}

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;
  using TU = TFromD<decltype(du)>;

  const auto vi = BitCast(di, v);
  const auto lowest_bit = BitCast(du, And(vi, Neg(vi)));

  constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
  const auto f32_biased_exp = detail::UIntToF32BiasedExp(lowest_bit);
  const auto tz_count = Sub(f32_biased_exp, Set(du, TU{127}));

  return BitCast(d,
                 Min(detail::F32ExpLzcntMinMaxBitCast(tz_count),
                     detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT))));
}
#endif  // HWY_NATIVE_LEADING_ZERO_COUNT

// ------------------------------ AESRound

// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE

// Define for white-box testing, even if native instructions are available.
namespace detail {

// Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with
// Vector Permute Instructions" and the accompanying assembly language
// implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan:
// https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html .
//
// A brute-force 256 byte table lookup can also be made constant-time, and
// possibly competitive on NEON, but this is more performance-portable
// especially for x86 and large vectors.

template <class V>  // u8
HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL,
                                               V affine_tblU) {
  const DFromV<V> du;
  const auto mask = Set(du, uint8_t{0xF});

  // Change polynomial basis to GF(2^4)
  {
    const VFromD<decltype(du)> basisL =
        Dup128VecFromValues(du, 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
                            0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA);
    const VFromD<decltype(du)> basisU =
        Dup128VecFromValues(du, 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
                            0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD);
    const auto sL = And(state, mask);
    const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
    const auto gf4L = TableLookupBytes(basisL, sL);
    const auto gf4U = TableLookupBytes(basisU, sU);
    state = Xor(gf4L, gf4U);
  }

  // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
  // cause TableLookupBytesOr0 to return 0.
  const VFromD<decltype(du)> zetaInv = Dup128VecFromValues(
      du, 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3);
  const VFromD<decltype(du)> tbl = Dup128VecFromValues(
      du, 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4);
  const auto sL = And(state, mask);      // L=low nibble, U=upper
  const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
  const auto sX = Xor(sU, sL);
  const auto invL = TableLookupBytes(zetaInv, sL);
  const auto invU = TableLookupBytes(tbl, sU);
  const auto invX = TableLookupBytes(tbl, sX);
  const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
  const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));

  const auto affL = TableLookupBytesOr0(affine_tblL, outL);
  const auto affU = TableLookupBytesOr0(affine_tblU, outU);
  return Xor(affL, affU);
}

template <class V>  // u8
HWY_INLINE V SubBytes(V state) {
  const DFromV<V> du;
  // Linear skew (cannot bake 0x63 bias into the table because out* indices
  // may have the infinity flag set).
  const VFromD<decltype(du)> affineL =
      Dup128VecFromValues(du, 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
                          0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15);
  const VFromD<decltype(du)> affineU =
      Dup128VecFromValues(du, 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
                          0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E);
  return Xor(SubBytesMulInverseAndAffineLookup(state, affineL, affineU),
             Set(du, uint8_t{0x63}));
}

template <class V>  // u8
HWY_INLINE V InvSubBytes(V state) {
  const DFromV<V> du;
  const VFromD<decltype(du)> gF2P4InvToGF2P8InvL =
      Dup128VecFromValues(du, 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13,
                          0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7);
  const VFromD<decltype(du)> gF2P4InvToGF2P8InvU =
      Dup128VecFromValues(du, 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12,
                          0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA);

  // Apply the inverse affine transformation
  const auto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)),
                          Or(ShiftLeft<3>(state), ShiftRight<5>(state)),
                          Or(ShiftLeft<6>(state), ShiftRight<2>(state))),
                     Set(du, uint8_t{0x05}));

  // The GF(2^8) multiplicative inverse is computed as follows:
  // - Changing the polynomial basis to GF(2^4)
  // - Computing the GF(2^4) multiplicative inverse
  // - Converting the GF(2^4) multiplicative inverse to the GF(2^8)
  //   multiplicative inverse through table lookups using the
  //   kGF2P4InvToGF2P8InvL and kGF2P4InvToGF2P8InvU tables
  return SubBytesMulInverseAndAffineLookup(b, gF2P4InvToGF2P8InvL,
                                           gF2P4InvToGF2P8InvU);
}

}  // namespace detail

#endif  // HWY_TARGET != HWY_SCALAR

#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif

// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
#if HWY_TARGET != HWY_SCALAR

namespace detail {

template <class V>  // u8
HWY_INLINE V ShiftRows(const V state) {
  const DFromV<V> du;
  // transposed: state is column major
  const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
      du, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11);
  return TableLookupBytes(state, shift_row);
}

template <class V>  // u8
HWY_INLINE V InvShiftRows(const V state) {
  const DFromV<V> du;
  // transposed: state is column major
  const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
      du, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3);
  return TableLookupBytes(state, shift_row);
}

template <class V>  // u8
HWY_INLINE V GF2P8Mod11BMulBy2(V v) {
  const DFromV<V> du;
  const RebindToSigned<decltype(du)> di;  // can only do signed comparisons
  const auto msb = Lt(BitCast(di, v), Zero(di));
  const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B})));
  return Xor(Add(v, v), overflow);  // = v*2 in GF(2^8).
}

template <class V>  // u8
HWY_INLINE V MixColumns(const V state) {
  const DFromV<V> du;
  // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
  // 2 3 1 1  // Let s := state*1, d := state*2, t := state*3.
  // 1 2 3 1  // d are on diagonal, no permutation needed.
  // 1 1 2 3  // t1230 indicates column indices of threes for the 4 rows.
  // 3 1 1 2  // We also need to compute s2301 and s3012 (=1230 o 2301).
  const VFromD<decltype(du)> v2301 = Dup128VecFromValues(
      du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
  const VFromD<decltype(du)> v1230 = Dup128VecFromValues(
      du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
  const auto d = GF2P8Mod11BMulBy2(state);  // = state*2 in GF(2^8).
  const auto s2301 = TableLookupBytes(state, v2301);
  const auto d_s2301 = Xor(d, s2301);
  const auto t_s2301 = Xor(state, d_s2301);  // t(s*3) = XOR-sum {s, d(s*2)}
  const auto t1230_s3012 = TableLookupBytes(t_s2301, v1230);
  return Xor(d_s2301, t1230_s3012);  // XOR-sum of 4 terms
}

template <class V>  // u8
HWY_INLINE V InvMixColumns(const V state) {
  const DFromV<V> du;
  // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
  // 14 11 13  9
  //  9 14 11 13
  // 13  9 14 11
  // 11 13  9 14
  const VFromD<decltype(du)> v2301 = Dup128VecFromValues(
      du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
  const VFromD<decltype(du)> v1230 = Dup128VecFromValues(
      du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);

  const auto sx2 = GF2P8Mod11BMulBy2(state); /* = state*2 in GF(2^8) */
  const auto sx4 = GF2P8Mod11BMulBy2(sx2);   /* = state*4 in GF(2^8) */
  const auto sx8 = GF2P8Mod11BMulBy2(sx4);   /* = state*8 in GF(2^8) */
  const auto sx9 = Xor(sx8, state);          /* = state*9 in GF(2^8) */
  const auto sx11 = Xor(sx9, sx2);           /* = state*11 in GF(2^8) */
  const auto sx13 = Xor(sx9, sx4);           /* = state*13 in GF(2^8) */
  const auto sx14 = Xor3(sx8, sx4, sx2);     /* = state*14 in GF(2^8) */

  const auto sx13_0123_sx9_1230 = Xor(sx13, TableLookupBytes(sx9, v1230));
  const auto sx14_0123_sx11_1230 = Xor(sx14, TableLookupBytes(sx11, v1230));
  const auto sx13_2301_sx9_3012 = TableLookupBytes(sx13_0123_sx9_1230, v2301);
  return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012);
}

}  // namespace detail

template <class V>  // u8
HWY_API V AESRound(V state, const V round_key) {
  // Intel docs swap the first two steps, but it does not matter because
  // ShiftRows is a permutation and SubBytes is independent of lane index.
  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = detail::MixColumns(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}

template <class V>  // u8
HWY_API V AESLastRound(V state, const V round_key) {
  // LIke AESRound, but without MixColumns.
  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}

template <class V>
HWY_API V AESInvMixColumns(V state) {
  return detail::InvMixColumns(state);
}

template <class V>  // u8
HWY_API V AESRoundInv(V state, const V round_key) {
  state = detail::InvSubBytes(state);
  state = detail::InvShiftRows(state);
  state = detail::InvMixColumns(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}

template <class V>  // u8
HWY_API V AESLastRoundInv(V state, const V round_key) {
  // Like AESRoundInv, but without InvMixColumns.
  state = detail::InvSubBytes(state);
  state = detail::InvShiftRows(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}

template <uint8_t kRcon, class V, HWY_IF_U8_D(DFromV<V>)>
HWY_API V AESKeyGenAssist(V v) {
  const DFromV<decltype(v)> d;
  const V rconXorMask = Dup128VecFromValues(d, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0,
                                            0, 0, kRcon, 0, 0, 0);
  const V rotWordShuffle = Dup128VecFromValues(d, 4, 5, 6, 7, 5, 6, 7, 4, 12,
                                               13, 14, 15, 13, 14, 15, 12);
  const auto sub_word_result = detail::SubBytes(v);
  const auto rot_word_result =
      TableLookupBytes(sub_word_result, rotWordShuffle);
  return Xor(rot_word_result, rconXorMask);
}

// Constant-time implementation inspired by
// https://www.bearssl.org/constanttime.html, but about half the cost because we
// use 64x64 multiplies and 128-bit XORs.
template <class V>
HWY_API V CLMulLower(V a, V b) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
  const auto k1 = Set(d, 0x1111111111111111ULL);
  const auto k2 = Set(d, 0x2222222222222222ULL);
  const auto k4 = Set(d, 0x4444444444444444ULL);
  const auto k8 = Set(d, 0x8888888888888888ULL);
  const auto a0 = And(a, k1);
  const auto a1 = And(a, k2);
  const auto a2 = And(a, k4);
  const auto a3 = And(a, k8);
  const auto b0 = And(b, k1);
  const auto b1 = And(b, k2);
  const auto b2 = And(b, k4);
  const auto b3 = And(b, k8);

  auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3));
  auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0));
  auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1));
  auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2));
  m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1)));
  m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2)));
  m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3)));
  m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0)));
  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
}

template <class V>
HWY_API V CLMulUpper(V a, V b) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
  const auto k1 = Set(d, 0x1111111111111111ULL);
  const auto k2 = Set(d, 0x2222222222222222ULL);
  const auto k4 = Set(d, 0x4444444444444444ULL);
  const auto k8 = Set(d, 0x8888888888888888ULL);
  const auto a0 = And(a, k1);
  const auto a1 = And(a, k2);
  const auto a2 = And(a, k4);
  const auto a3 = And(a, k8);
  const auto b0 = And(b, k1);
  const auto b1 = And(b, k2);
  const auto b2 = And(b, k4);
  const auto b3 = And(b, k8);

  auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
  auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
  auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
  auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
  m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
  m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
  m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
  m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
}

#endif  // HWY_NATIVE_AES
#endif  // HWY_TARGET != HWY_SCALAR

// ------------------------------ PopulationCount

#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

// This overload requires vectors to be at least 16 bytes, which is the case
// for LMUL >= 2.
#undef HWY_IF_POPCNT
#if HWY_TARGET == HWY_RVV
#define HWY_IF_POPCNT(D) \
  hwy::EnableIf<D().Pow2() >= 1 && D().MaxLanes() >= 16>* = nullptr
#else
// Other targets only have these two overloads which are mutually exclusive, so
// no further conditions are required.
#define HWY_IF_POPCNT(D) void* = nullptr
#endif  // HWY_TARGET == HWY_RVV

template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
          HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)>
HWY_API V PopulationCount(V v) {
  const D d;
  const V lookup =
      Dup128VecFromValues(d, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
  const auto lo = And(v, Set(d, uint8_t{0xF}));
  const auto hi = ShiftRight<4>(v);
  return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
}

// RVV has a specialization that avoids the Set().
#if HWY_TARGET != HWY_RVV
// Slower fallback for capped vectors.
template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
          HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API V PopulationCount(V v) {
  const D d;
  // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
  const V k33 = Set(d, uint8_t{0x33});
  v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
  v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
  return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
}
#endif  // HWY_TARGET != HWY_RVV

template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)>
HWY_API V PopulationCount(V v) {
  const D d;
  const Repartition<uint8_t, decltype(d)> d8;
  const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
  return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
}

template <class V, class D = DFromV<V>, HWY_IF_U32_D(D)>
HWY_API V PopulationCount(V v) {
  const D d;
  Repartition<uint16_t, decltype(d)> d16;
  auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
  return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
}

#if HWY_HAVE_INTEGER64
template <class V, class D = DFromV<V>, HWY_IF_U64_D(D)>
HWY_API V PopulationCount(V v) {
  const D d;
  Repartition<uint32_t, decltype(d)> d32;
  auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
  return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
}
#endif

#endif  // HWY_NATIVE_POPCNT

// ------------------------------ 8-bit multiplication

#if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif

// 8 bit and fits in wider reg: promote
template <class V, HWY_IF_T_SIZE_V(V, 1),
          HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
HWY_API V operator*(const V a, const V b) {
  const DFromV<decltype(a)> d;
  const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;    // TruncateTo result
  const RebindToUnsigned<decltype(dw)> dwu;  // TruncateTo input
  const VFromD<decltype(dw)> mul = PromoteTo(dw, a) * PromoteTo(dw, b);
  // TruncateTo is cheaper than ConcatEven.
  return BitCast(d, TruncateTo(du, BitCast(dwu, mul)));
}

// 8 bit full reg: promote halves
template <class V, HWY_IF_T_SIZE_V(V, 1),
          HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_API V operator*(const V a, const V b) {
  const DFromV<decltype(a)> d;
  const Half<decltype(d)> dh;
  const Twice<RepartitionToWide<decltype(dh)>> dw;
  const VFromD<decltype(dw)> a0 = PromoteTo(dw, LowerHalf(dh, a));
  const VFromD<decltype(dw)> a1 = PromoteTo(dw, UpperHalf(dh, a));
  const VFromD<decltype(dw)> b0 = PromoteTo(dw, LowerHalf(dh, b));
  const VFromD<decltype(dw)> b1 = PromoteTo(dw, UpperHalf(dh, b));
  const VFromD<decltype(dw)> m0 = a0 * b0;
  const VFromD<decltype(dw)> m1 = a1 * b1;
  return ConcatEven(d, BitCast(d, m1), BitCast(d, m0));
}

#endif  // HWY_NATIVE_MUL_8

// ------------------------------ 64-bit multiplication

#if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif

// Single-lane i64 or u64
template <class V, HWY_IF_T_SIZE_V(V, 8), HWY_IF_V_SIZE_V(V, 8),
          HWY_IF_NOT_FLOAT_V(V)>
HWY_API V operator*(V x, V y) {
  const DFromV<V> d;
  using T = TFromD<decltype(d)>;
  using TU = MakeUnsigned<T>;
  const TU xu = static_cast<TU>(GetLane(x));
  const TU yu = static_cast<TU>(GetLane(y));
  return Set(d, static_cast<T>(xu * yu));
}

template <class V, class D64 = DFromV<V>, HWY_IF_U64_D(D64),
          HWY_IF_V_SIZE_GT_D(D64, 8)>
HWY_API V operator*(V x, V y) {
  RepartitionToNarrow<D64> d32;
  auto x32 = BitCast(d32, x);
  auto y32 = BitCast(d32, y);
  auto lolo = BitCast(d32, MulEven(x32, y32));
  auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
  auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
  auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
  return BitCast(D64{}, lolo + hi);
}
template <class V, class DI64 = DFromV<V>, HWY_IF_I64_D(DI64),
          HWY_IF_V_SIZE_GT_D(DI64, 8)>
HWY_API V operator*(V x, V y) {
  RebindToUnsigned<DI64> du64;
  return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
}

#endif  // HWY_NATIVE_MUL_64

// ------------------------------ MulAdd / NegMulAdd

#if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INT_FMA
#undef HWY_NATIVE_INT_FMA
#else
#define HWY_NATIVE_INT_FMA
#endif

#ifdef HWY_NATIVE_INT_FMSUB
#undef HWY_NATIVE_INT_FMSUB
#else
#define HWY_NATIVE_INT_FMSUB
#endif

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V MulAdd(V mul, V x, V add) {
  return Add(Mul(mul, x), add);
}

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V NegMulAdd(V mul, V x, V add) {
  return Sub(add, Mul(mul, x));
}

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V MulSub(V mul, V x, V sub) {
  return Sub(Mul(mul, x), sub);
}
#endif  // HWY_NATIVE_INT_FMA

// ------------------------------ Integer MulSub / NegMulSub
#if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INT_FMSUB
#undef HWY_NATIVE_INT_FMSUB
#else
#define HWY_NATIVE_INT_FMSUB
#endif

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V MulSub(V mul, V x, V sub) {
  const DFromV<decltype(mul)> d;
  const RebindToSigned<decltype(d)> di;
  return MulAdd(mul, x, BitCast(d, Neg(BitCast(di, sub))));
}

#endif  // HWY_NATIVE_INT_FMSUB

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V NegMulSub(V mul, V x, V sub) {
  const DFromV<decltype(mul)> d;
  const RebindToSigned<decltype(d)> di;

  return BitCast(d, Neg(BitCast(di, MulAdd(mul, x, sub))));
}

// ------------------------------ MulAddSub

// MulAddSub(mul, x, sub_or_add) for a 1-lane vector is equivalent to
// MulSub(mul, x, sub_or_add)
template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
  return MulSub(mul, x, sub_or_add);
}

// MulAddSub for F16/F32/F64 vectors with 2 or more lanes on
// SSSE3/SSE4/AVX2/AVX3 is implemented in x86_128-inl.h, x86_256-inl.h, and
// x86_512-inl.h
template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | ((HWY_TARGET <= HWY_SSSE3 &&
                                                 hwy::IsFloat<TFromV<V>>())
                                                    ? 0
                                                    : ((1 << 2) | (1 << 4) |
                                                       (1 << 8))))>
HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
  using D = DFromV<V>;
  using T = TFromD<D>;
  using TNegate = If<!IsSigned<T>(), MakeSigned<T>, T>;

  const D d;
  const Rebind<TNegate, D> d_negate;

  const auto add =
      OddEven(sub_or_add, BitCast(d, Neg(BitCast(d_negate, sub_or_add))));
  return MulAdd(mul, x, add);
}

// ------------------------------ Integer division
#if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INT_DIV
#undef HWY_NATIVE_INT_DIV
#else
#define HWY_NATIVE_INT_DIV
#endif

namespace detail {

template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
HWY_INLINE Vec<D> IntDivConvFloatToInt(D di, V vf) {
  return ConvertTo(di, vf);
}

template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
HWY_INLINE Vec<D> IntDivConvIntToFloat(D df, V vi) {
  return ConvertTo(df, vi);
}

#if !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
template <class D, class V, HWY_IF_UI64_D(D), HWY_IF_F32(TFromV<V>)>
HWY_INLINE Vec<D> IntDivConvFloatToInt(D df, V vi) {
  return PromoteTo(df, vi);
}

// If !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 is true, then UI64->F32
// IntDivConvIntToFloat(df, vi) returns an approximation of
// static_cast<float>(v[i]) that is within 4 ULP of static_cast<float>(v[i])
template <class D, class V, HWY_IF_F32_D(D), HWY_IF_I64(TFromV<V>)>
HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vi) {
  const Twice<decltype(df32)> dt_f32;

  auto vf32 =
      ConvertTo(dt_f32, BitCast(RebindToSigned<decltype(dt_f32)>(), vi));

#if HWY_IS_LITTLE_ENDIAN
  const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
  auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
#else
  const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
  auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
#endif

  const RebindToSigned<decltype(df32)> di32;

  hi_f32 =
      Add(hi_f32, And(BitCast(df32, BroadcastSignBit(BitCast(di32, lo_f32))),
                      Set(df32, 1.0f)));
  return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
}

template <class D, class V, HWY_IF_F32_D(D), HWY_IF_U64(TFromV<V>)>
HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vu) {
  const Twice<decltype(df32)> dt_f32;

  auto vf32 =
      ConvertTo(dt_f32, BitCast(RebindToUnsigned<decltype(dt_f32)>(), vu));

#if HWY_IS_LITTLE_ENDIAN
  const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
  const auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
#else
  const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
  const auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
#endif

  return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
}
#endif  // !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64

template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_GT(TFromV<V>, kOrigLaneSize)>
HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToFloat<decltype(d)> df;

  // If kOrigLaneSize < sizeof(T) is true, then a[i] and b[i] are both in the
  // [LimitsMin<SignedFromSize<kOrigLaneSize>>(),
  // LimitsMax<UnsignedFromSize<kOrigLaneSize>>()] range.

  // floor(|a[i] / b[i]|) <= |flt_q| < floor(|a[i] / b[i]|) + 1 is also
  // guaranteed to be true if MakeFloat<T> has at least kOrigLaneSize*8 + 1
  // mantissa bits (including the implied one bit), where flt_q is equal to
  // static_cast<MakeFloat<T>>(a[i]) / static_cast<MakeFloat<T>>(b[i]),
  // even in the case where the magnitude of an inexact floating point division
  // result is rounded up.

  // In other words, floor(flt_q) < flt_q < ceil(flt_q) is guaranteed to be true
  // if (a[i] % b[i]) != 0 is true and MakeFloat<T> has at least
  // kOrigLaneSize*8 + 1 mantissa bits (including the implied one bit), even in
  // the case where the magnitude of an inexact floating point division result
  // is rounded up.

#if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
    !HWY_HAVE_FLOAT64
  // On Armv7, do division by multiplying by the ApproximateReciprocal
  // to avoid unnecessary overhead as F32 Div refines the approximate
  // reciprocal using 4 Newton-Raphson iterations

  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  const auto flt_b = ConvertTo(df, b);
  auto flt_recip_b = ApproximateReciprocal(flt_b);
  if (kOrigLaneSize > 1) {
    flt_recip_b =
        Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
  }

  auto q0 = ConvertTo(d, Mul(ConvertTo(df, a), flt_recip_b));
  const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));

  auto r1 = r0;

  // Need to negate r1[i] if a[i] < 0 is true
  if (IsSigned<TFromV<V>>()) {
    r1 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r1);
  }

  // r1[i] is now equal to (a[i] < 0) ? (-r0[i]) : r0[i]

  auto abs_b = BitCast(du, b);
  if (IsSigned<TFromV<V>>()) {
    abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
  }

  // If (r1[i] < 0 || r1[i] >= abs_b[i]) is true, then set q1[i] to -1.
  // Otherwise, set q1[i] to 0.

  // (r1[i] < 0 || r1[i] >= abs_b[i]) can be carried out using a single unsigned
  // comparison as static_cast<TU>(r1[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
  // will be true if r1[i] < 0 is true.
  auto q1 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r1), abs_b)));

  // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ? -1 : 0

  // Need to negate q1[i] if r0[i] and b[i] do not have the same sign
  auto q1_negate_mask = r0;
  if (IsSigned<TFromV<V>>()) {
    q1_negate_mask = Xor(q1_negate_mask, BitCast(di, b));
  }
  q1 = IfNegativeThenElse(q1_negate_mask, Neg(q1), q1);

  // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ?
  //                       (((r0[i] ^ b[i]) < 0) ? 1 : -1)

  // Need to subtract q1[i] from q0[i] to get the final result
  return Sub(q0, BitCast(d, q1));
#else
  // On targets other than Armv7 NEON, use F16 or F32 division as most targets
  // other than Armv7 NEON have native F32 divide instructions
  return ConvertTo(d, Div(ConvertTo(df, a), ConvertTo(df, b)));
#endif
}

template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE(TFromV<V>, kOrigLaneSize),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
  // If kOrigLaneSize == sizeof(T) is true, at least two reciprocal
  // multiplication steps are needed as the mantissa of MakeFloat<T> has fewer
  // than kOrigLaneSize*8 + 1 bits

  using T = TFromV<V>;

#if HWY_HAVE_FLOAT64
  using TF = MakeFloat<T>;
#else
  using TF = float;
#endif

  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;
  const Rebind<TF, decltype(d)> df;

  if (!IsSigned<T>()) {
    // If T is unsigned, set a[i] to (a[i] >= b[i] ? 1 : 0) and set b[i] to 1 if
    // b[i] > LimitsMax<MakeSigned<T>>() is true

    const auto one = Set(di, MakeSigned<T>{1});
    a = BitCast(
        d, IfNegativeThenElse(BitCast(di, b),
                              IfThenElseZero(RebindMask(di, Ge(a, b)), one),
                              BitCast(di, a)));
    b = BitCast(d, IfNegativeThenElse(BitCast(di, b), one, BitCast(di, b)));
  }

  // LimitsMin<T>() <= b[i] <= LimitsMax<MakeSigned<T>>() is now true

  const auto flt_b = IntDivConvIntToFloat(df, b);

#if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
    !HWY_HAVE_FLOAT64
  auto flt_recip_b = ApproximateReciprocal(flt_b);
  flt_recip_b =
      Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
#else
  const auto flt_recip_b = Div(Set(df, TF(1.0)), flt_b);
#endif

  auto q0 =
      IntDivConvFloatToInt(d, Mul(IntDivConvIntToFloat(df, a), flt_recip_b));
  const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));

  auto q1 =
      IntDivConvFloatToInt(di, Mul(IntDivConvIntToFloat(df, r0), flt_recip_b));
  const auto r1 = hwy::HWY_NAMESPACE::NegMulAdd(q1, BitCast(di, b), r0);

  auto r3 = r1;

#if !HWY_HAVE_FLOAT64
  // Need two additional reciprocal multiplication steps for I64/U64 vectors if
  // HWY_HAVE_FLOAT64 is 0
  if (sizeof(T) == 8) {
    const auto q2 = IntDivConvFloatToInt(
        di, Mul(IntDivConvIntToFloat(df, r1), flt_recip_b));
    const auto r2 = hwy::HWY_NAMESPACE::NegMulAdd(q2, BitCast(di, b), r1);

    const auto q3 = IntDivConvFloatToInt(
        di, Mul(IntDivConvIntToFloat(df, r2), flt_recip_b));
    r3 = hwy::HWY_NAMESPACE::NegMulAdd(q3, BitCast(di, b), r2);

    q0 = Add(q0, BitCast(d, q2));
    q1 = Add(q1, q3);
  }
#endif  // !HWY_HAVE_FLOAT64

  auto r4 = r3;

  // Need to negate r4[i] if a[i] < 0 is true
  if (IsSigned<TFromV<V>>()) {
    r4 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r4);
  }

  // r4[i] is now equal to (a[i] < 0) ? (-r3[i]) : r3[i]

  auto abs_b = BitCast(du, b);
  if (IsSigned<TFromV<V>>()) {
    abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
  }

  // If (r4[i] < 0 || r4[i] >= abs_b[i]) is true, then set q4[i] to -1.
  // Otherwise, set r4[i] to 0.

  // (r4[i] < 0 || r4[i] >= abs_b[i]) can be carried out using a single unsigned
  // comparison as static_cast<TU>(r4[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
  // will be true if r4[i] < 0 is true.
  auto q4 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r4), abs_b)));

  // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ? -1 : 0

  // Need to negate q4[i] if r3[i] and b[i] do not have the same sign
  auto q4_negate_mask = r3;
  if (IsSigned<TFromV<V>>()) {
    q4_negate_mask = Xor(q4_negate_mask, BitCast(di, b));
  }
  q4 = IfNegativeThenElse(q4_negate_mask, Neg(q4), q4);

  // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ?
  //                       (((r3[i] ^ b[i]) < 0) ? 1 : -1)

  // The final result is equal to q0[i] + q1[i] - q4[i]
  return Sub(Add(q0, BitCast(d, q1)), BitCast(d, q4));
}

template <size_t kOrigLaneSize, class V,
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
          HWY_IF_V_SIZE_LE_V(
              V, HWY_MAX_BYTES /
                     ((!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1) ? 4 : 2))>
HWY_INLINE V IntDiv(V a, V b) {
  using T = TFromV<V>;

  // If HWY_HAVE_FLOAT16 is 0, need to promote I8 to I32 and U8 to U32
  using TW = MakeWide<
      If<(!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1), MakeWide<T>, T>>;

  const DFromV<decltype(a)> d;
  const Rebind<TW, decltype(d)> dw;

#if HWY_TARGET <= HWY_SSE2
  // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid
  // unnecessary overhead
  const RebindToSigned<decltype(dw)> dw_i;

  // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<T> if
  // kOrigLaneSize < sizeof(T) to avoid unnecessary overhead
  const If<(kOrigLaneSize < sizeof(T)), RebindToSigned<decltype(d)>,
           decltype(d)>
      d_demote_to;
#else
  // On other targets, promote to TW and demote to T
  const decltype(dw) dw_i;
  const decltype(d) d_demote_to;
#endif

  return BitCast(
      d, DemoteTo(d_demote_to, IntDivUsingFloatDiv<kOrigLaneSize>(
                                   PromoteTo(dw_i, a), PromoteTo(dw_i, b))));
}

template <size_t kOrigLaneSize, class V,
          HWY_IF_T_SIZE_ONE_OF_V(V,
                                 (HWY_HAVE_FLOAT16 ? (1 << 1) : 0) | (1 << 2)),
          HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;

#if HWY_TARGET <= HWY_SSE2
  // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid
  // unnecessary overhead
  const RebindToSigned<decltype(dw)> dw_i;

  // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<TFromV<V>> if
  // kOrigLaneSize < sizeof(TFromV<V>) to avoid unnecessary overhead
  const If<(kOrigLaneSize < sizeof(TFromV<V>)), RebindToSigned<decltype(d)>,
           decltype(d)>
      d_demote_to;
#else
  // On other targets, promote to MakeWide<TFromV<V>> and demote to TFromV<V>
  const decltype(dw) dw_i;
  const decltype(d) d_demote_to;
#endif

  return BitCast(d, OrderedDemote2To(
                        d_demote_to,
                        IntDivUsingFloatDiv<kOrigLaneSize>(
                            PromoteLowerTo(dw_i, a), PromoteLowerTo(dw_i, b)),
                        IntDivUsingFloatDiv<kOrigLaneSize>(
                            PromoteUpperTo(dw_i, a), PromoteUpperTo(dw_i, b))));
}

#if !HWY_HAVE_FLOAT16
template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
          HWY_IF_V_SIZE_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) {
  const DFromV<decltype(a)> d;
  const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;

#if HWY_TARGET <= HWY_SSE2
  // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary
  // overhead
  const RebindToSigned<decltype(dw)> dw_i;
#else
  // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V>
  const decltype(dw) dw_i;
#endif

  return DemoteTo(d,
                  BitCast(dw_i, IntDiv<1>(PromoteTo(dw, a), PromoteTo(dw, b))));
}
template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
          HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;

#if HWY_TARGET <= HWY_SSE2
  // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary
  // overhead
  const RebindToSigned<decltype(dw)> dw_i;
#else
  // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V>
  const decltype(dw) dw_i;
#endif

  return OrderedDemote2To(
      d, BitCast(dw_i, IntDiv<1>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b))),
      BitCast(dw_i, IntDiv<1>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b))));
}
#endif  // !HWY_HAVE_FLOAT16

template <size_t kOrigLaneSize, class V,
          HWY_IF_T_SIZE_ONE_OF_V(V,
                                 (HWY_HAVE_FLOAT64 ? 0 : (1 << 4)) | (1 << 8))>
HWY_INLINE V IntDiv(V a, V b) {
  return IntDivUsingFloatDiv<kOrigLaneSize>(a, b);
}

#if HWY_HAVE_FLOAT64
template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
          HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) {
  const DFromV<decltype(a)> d;
  const Rebind<double, decltype(d)> df64;

  return DemoteTo(d, Div(PromoteTo(df64, a), PromoteTo(df64, b)));
}
template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
          HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) {
  const DFromV<decltype(a)> d;
  const Half<decltype(d)> dh;
  const Repartition<double, decltype(d)> df64;

  return Combine(
      d, DemoteTo(dh, Div(PromoteUpperTo(df64, a), PromoteUpperTo(df64, b))),
      DemoteTo(dh, Div(PromoteLowerTo(df64, a), PromoteLowerTo(df64, b))));
}
#endif  // HWY_HAVE_FLOAT64

template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, ((HWY_TARGET <= HWY_SSE2 ||
                                      HWY_TARGET == HWY_WASM ||
                                      HWY_TARGET == HWY_WASM_EMU256)
                                         ? 0
                                         : (1 << 1)) |
                                        (1 << 2) | (1 << 4) | (1 << 8))>
HWY_INLINE V IntMod(V a, V b) {
  return hwy::HWY_NAMESPACE::NegMulAdd(IntDiv<kOrigLaneSize>(a, b), b, a);
}

#if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
    HWY_TARGET == HWY_WASM_EMU256
template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
          HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntMod(V a, V b) {
  const DFromV<decltype(a)> d;
  const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
  return DemoteTo(d, IntMod<kOrigLaneSize>(PromoteTo(dw, a), PromoteTo(dw, b)));
}

template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
          HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntMod(V a, V b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  return OrderedDemote2To(
      d, IntMod<kOrigLaneSize>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b)),
      IntMod<kOrigLaneSize>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b)));
}
#endif  // HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || HWY_TARGET ==
        // HWY_WASM_EMU256

}  // namespace detail

#if HWY_TARGET == HWY_SCALAR

template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec1<T> operator/(Vec1<T> a, Vec1<T> b) {
  return detail::IntDiv<sizeof(T)>(a, b);
}
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec1<T> operator%(Vec1<T> a, Vec1<T> b) {
  return detail::IntMod<sizeof(T)>(a, b);
}

#else  // HWY_TARGET != HWY_SCALAR

template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::IntDiv<sizeof(T)>(a, b);
}

template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::IntMod<sizeof(T)>(a, b);
}

#if HWY_CAP_GE256
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec256<T> operator/(Vec256<T> a, Vec256<T> b) {
  return detail::IntDiv<sizeof(T)>(a, b);
}
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec256<T> operator%(Vec256<T> a, Vec256<T> b) {
  return detail::IntMod<sizeof(T)>(a, b);
}
#endif

#if HWY_CAP_GE512
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec512<T> operator/(Vec512<T> a, Vec512<T> b) {
  return detail::IntDiv<sizeof(T)>(a, b);
}
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec512<T> operator%(Vec512<T> a, Vec512<T> b) {
  return detail::IntMod<sizeof(T)>(a, b);
}
#endif

#endif  // HWY_TARGET == HWY_SCALAR

#endif  // HWY_NATIVE_INT_DIV

// ------------------------------ SatWidenMulPairwiseAdd

#if (defined(HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD) == \
     defined(HWY_TARGET_TOGGLE))

#ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#else
#define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#endif

template <class DI16, class VU8, class VI8,
          class VU8_2 = Vec<Repartition<uint8_t, DI16>>, HWY_IF_I16_D(DI16),
          HWY_IF_U8_D(DFromV<VU8>), HWY_IF_I8_D(DFromV<VI8>),
          HWY_IF_LANES_D(DFromV<VU8>, HWY_MAX_LANES_V(VI8)),
          HWY_IF_LANES_D(DFromV<VU8>, HWY_MAX_LANES_V(VU8_2))>
HWY_API Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) {
  const RebindToUnsigned<decltype(di16)> du16;

  const auto a0 = BitCast(di16, PromoteEvenTo(du16, a));
  const auto b0 = PromoteEvenTo(di16, b);

  const auto a1 = BitCast(di16, PromoteOddTo(du16, a));
  const auto b1 = PromoteOddTo(di16, b);

  return SaturatedAdd(Mul(a0, b0), Mul(a1, b1));
}

#endif

// ------------------------------ SumOfMulQuadAccumulate

#if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \
     defined(HWY_TARGET_TOGGLE))

#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#endif

template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
                                            VFromD<Repartition<int8_t, DI32>> a,
                                            VFromD<Repartition<int8_t, DI32>> b,
                                            VFromD<DI32> sum) {
  const Repartition<int16_t, decltype(di32)> di16;

  const auto a0 = PromoteEvenTo(di16, a);
  const auto b0 = PromoteEvenTo(di16, b);

  const auto a1 = PromoteOddTo(di16, a);
  const auto b1 = PromoteOddTo(di16, b);

  return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0),
                      WidenMulPairwiseAdd(di32, a1, b1)));
}

#endif

#if (defined(HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE) == \
     defined(HWY_TARGET_TOGGLE))

#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#endif

template <class DU32, HWY_IF_U32_D(DU32)>
HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
    DU32 du32, VFromD<Repartition<uint8_t, DU32>> a,
    VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
  const Repartition<uint16_t, decltype(du32)> du16;
  const RebindToSigned<decltype(du16)> di16;
  const RebindToSigned<decltype(du32)> di32;

  const auto lo8_mask = Set(di16, int16_t{0x00FF});
  const auto a0 = And(BitCast(di16, a), lo8_mask);
  const auto b0 = And(BitCast(di16, b), lo8_mask);

  const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a)));
  const auto b1 = BitCast(di16, ShiftRight<8>(BitCast(du16, b)));

  return Add(sum, Add(BitCast(du32, WidenMulPairwiseAdd(di32, a0, b0)),
                      BitCast(du32, WidenMulPairwiseAdd(di32, a1, b1))));
}

#endif

#if (defined(HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE) == \
     defined(HWY_TARGET_TOGGLE))

#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#endif

template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
    DI32 di32, VFromD<Repartition<uint8_t, DI32>> a_u,
    VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
  const Repartition<int16_t, decltype(di32)> di16;
  const RebindToUnsigned<decltype(di16)> du16;

  const auto a0 = And(BitCast(di16, a_u), Set(di16, int16_t{0x00FF}));
  const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b_i)));

  const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a_u)));
  const auto b1 = ShiftRight<8>(BitCast(di16, b_i));

  // NOTE: SatWidenMulPairwiseAdd(di16, a_u, b_i) cannot be used in
  // SumOfMulQuadAccumulate as it is possible for
  // a_u[0]*b_i[0]+a_u[1]*b_i[1] to overflow an int16_t if a_u[0], b_i[0],
  // a_u[1], and b_i[1] are all non-zero and b_i[0] and b_i[1] have the same
  // sign.

  return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0),
                      WidenMulPairwiseAdd(di32, a1, b1)));
}

#endif

#if (defined(HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE) == \
     defined(HWY_TARGET_TOGGLE))

#ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
#endif

#if HWY_HAVE_INTEGER64
template <class DI64, HWY_IF_I64_D(DI64)>
HWY_API VFromD<DI64> SumOfMulQuadAccumulate(
    DI64 di64, VFromD<Repartition<int16_t, DI64>> a,
    VFromD<Repartition<int16_t, DI64>> b, VFromD<DI64> sum) {
  const Repartition<int32_t, decltype(di64)> di32;

  // WidenMulPairwiseAdd(di32, a, b) is okay here as
  // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as
  // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if
  // a[0], b[0], a[1], and b[1] are all equal to -32768.

  const auto i32_pairwise_sum = WidenMulPairwiseAdd(di32, a, b);
  const auto i32_pairwise_sum_overflow =
      VecFromMask(di32, Eq(i32_pairwise_sum, Set(di32, LimitsMin<int32_t>())));

  // The upper 32 bits of sum0 and sum1 need to be zeroed out in the case of
  // overflow.
  const auto hi32_mask = Set(di64, static_cast<int64_t>(~int64_t{0xFFFFFFFF}));
  const auto p0_zero_out_mask =
      ShiftLeft<32>(BitCast(di64, i32_pairwise_sum_overflow));
  const auto p1_zero_out_mask =
      And(BitCast(di64, i32_pairwise_sum_overflow), hi32_mask);

  const auto p0 =
      AndNot(p0_zero_out_mask,
             ShiftRight<32>(ShiftLeft<32>(BitCast(di64, i32_pairwise_sum))));
  const auto p1 =
      AndNot(p1_zero_out_mask, ShiftRight<32>(BitCast(di64, i32_pairwise_sum)));

  return Add(sum, Add(p0, p1));
}
#endif  // HWY_HAVE_INTEGER64
#endif  // HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE

#if (defined(HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE) == \
     defined(HWY_TARGET_TOGGLE))

#ifdef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
#endif

#if HWY_HAVE_INTEGER64
template <class DU64, HWY_IF_U64_D(DU64)>
HWY_API VFromD<DU64> SumOfMulQuadAccumulate(
    DU64 du64, VFromD<Repartition<uint16_t, DU64>> a,
    VFromD<Repartition<uint16_t, DU64>> b, VFromD<DU64> sum) {
  const auto u32_even_prod = MulEven(a, b);
  const auto u32_odd_prod = MulOdd(a, b);

  const auto p0 = Add(PromoteEvenTo(du64, u32_even_prod),
                      PromoteEvenTo(du64, u32_odd_prod));
  const auto p1 =
      Add(PromoteOddTo(du64, u32_even_prod), PromoteOddTo(du64, u32_odd_prod));

  return Add(sum, Add(p0, p1));
}
#endif  // HWY_HAVE_INTEGER64
#endif  // HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE

// ------------------------------ F64 ApproximateReciprocal

#if (defined(HWY_NATIVE_F64_APPROX_RECIP) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F64_APPROX_RECIP
#undef HWY_NATIVE_F64_APPROX_RECIP
#else
#define HWY_NATIVE_F64_APPROX_RECIP
#endif

#if HWY_HAVE_FLOAT64
template <class V, HWY_IF_F64_D(DFromV<V>)>
HWY_API V ApproximateReciprocal(V v) {
  const DFromV<decltype(v)> d;
  return Div(Set(d, 1.0), v);
}
#endif  // HWY_HAVE_FLOAT64

#endif  // HWY_NATIVE_F64_APPROX_RECIP

// ------------------------------ F64 ApproximateReciprocalSqrt

#if (defined(HWY_NATIVE_F64_APPROX_RSQRT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
#undef HWY_NATIVE_F64_APPROX_RSQRT
#else
#define HWY_NATIVE_F64_APPROX_RSQRT
#endif

#if HWY_HAVE_FLOAT64
template <class V, HWY_IF_F64_D(DFromV<V>)>
HWY_API V ApproximateReciprocalSqrt(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto half = Mul(v, Set(d, 0.5));
  // Initial guess based on log2(f)
  const auto guess = BitCast(d, Sub(Set(du, uint64_t{0x5FE6EB50C7B537A9u}),
                                    ShiftRight<1>(BitCast(du, v))));
  // One Newton-Raphson iteration
  return Mul(guess, NegMulAdd(Mul(half, guess), guess, Set(d, 1.5)));
}
#endif  // HWY_HAVE_FLOAT64

#endif  // HWY_NATIVE_F64_APPROX_RSQRT

// ------------------------------ Compress*

#if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_COMPRESS8
#undef HWY_NATIVE_COMPRESS8
#else
#define HWY_NATIVE_COMPRESS8
#endif

template <class V, class D, typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API size_t CompressBitsStore(V v, const uint8_t* HWY_RESTRICT bits, D d,
                                 T* unaligned) {
  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  const Simd<T, HWY_MIN(MaxLanes(d), 8), 0> d8;
  T* HWY_RESTRICT pos = unaligned;

  HWY_ALIGN constexpr T table[2048] = {
      0, 1, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
      1, 0, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
      2, 0, 1, 3, 4, 5, 6, 7, /**/ 0, 2, 1, 3, 4, 5, 6, 7,  //
      1, 2, 0, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
      3, 0, 1, 2, 4, 5, 6, 7, /**/ 0, 3, 1, 2, 4, 5, 6, 7,  //
      1, 3, 0, 2, 4, 5, 6, 7, /**/ 0, 1, 3, 2, 4, 5, 6, 7,  //
      2, 3, 0, 1, 4, 5, 6, 7, /**/ 0, 2, 3, 1, 4, 5, 6, 7,  //
      1, 2, 3, 0, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
      4, 0, 1, 2, 3, 5, 6, 7, /**/ 0, 4, 1, 2, 3, 5, 6, 7,  //
      1, 4, 0, 2, 3, 5, 6, 7, /**/ 0, 1, 4, 2, 3, 5, 6, 7,  //
      2, 4, 0, 1, 3, 5, 6, 7, /**/ 0, 2, 4, 1, 3, 5, 6, 7,  //
      1, 2, 4, 0, 3, 5, 6, 7, /**/ 0, 1, 2, 4, 3, 5, 6, 7,  //
      3, 4, 0, 1, 2, 5, 6, 7, /**/ 0, 3, 4, 1, 2, 5, 6, 7,  //
      1, 3, 4, 0, 2, 5, 6, 7, /**/ 0, 1, 3, 4, 2, 5, 6, 7,  //
      2, 3, 4, 0, 1, 5, 6, 7, /**/ 0, 2, 3, 4, 1, 5, 6, 7,  //
      1, 2, 3, 4, 0, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
      5, 0, 1, 2, 3, 4, 6, 7, /**/ 0, 5, 1, 2, 3, 4, 6, 7,  //
      1, 5, 0, 2, 3, 4, 6, 7, /**/ 0, 1, 5, 2, 3, 4, 6, 7,  //
      2, 5, 0, 1, 3, 4, 6, 7, /**/ 0, 2, 5, 1, 3, 4, 6, 7,  //
      1, 2, 5, 0, 3, 4, 6, 7, /**/ 0, 1, 2, 5, 3, 4, 6, 7,  //
      3, 5, 0, 1, 2, 4, 6, 7, /**/ 0, 3, 5, 1, 2, 4, 6, 7,  //
      1, 3, 5, 0, 2, 4, 6, 7, /**/ 0, 1, 3, 5, 2, 4, 6, 7,  //
      2, 3, 5, 0, 1, 4, 6, 7, /**/ 0, 2, 3, 5, 1, 4, 6, 7,  //
      1, 2, 3, 5, 0, 4, 6, 7, /**/ 0, 1, 2, 3, 5, 4, 6, 7,  //
      4, 5, 0, 1, 2, 3, 6, 7, /**/ 0, 4, 5, 1, 2, 3, 6, 7,  //
      1, 4, 5, 0, 2, 3, 6, 7, /**/ 0, 1, 4, 5, 2, 3, 6, 7,  //
      2, 4, 5, 0, 1, 3, 6, 7, /**/ 0, 2, 4, 5, 1, 3, 6, 7,  //
      1, 2, 4, 5, 0, 3, 6, 7, /**/ 0, 1, 2, 4, 5, 3, 6, 7,  //
      3, 4, 5, 0, 1, 2, 6, 7, /**/ 0, 3, 4, 5, 1, 2, 6, 7,  //
      1, 3, 4, 5, 0, 2, 6, 7, /**/ 0, 1, 3, 4, 5, 2, 6, 7,  //
      2, 3, 4, 5, 0, 1, 6, 7, /**/ 0, 2, 3, 4, 5, 1, 6, 7,  //
      1, 2, 3, 4, 5, 0, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
      6, 0, 1, 2, 3, 4, 5, 7, /**/ 0, 6, 1, 2, 3, 4, 5, 7,  //
      1, 6, 0, 2, 3, 4, 5, 7, /**/ 0, 1, 6, 2, 3, 4, 5, 7,  //
      2, 6, 0, 1, 3, 4, 5, 7, /**/ 0, 2, 6, 1, 3, 4, 5, 7,  //
      1, 2, 6, 0, 3, 4, 5, 7, /**/ 0, 1, 2, 6, 3, 4, 5, 7,  //
      3, 6, 0, 1, 2, 4, 5, 7, /**/ 0, 3, 6, 1, 2, 4, 5, 7,  //
      1, 3, 6, 0, 2, 4, 5, 7, /**/ 0, 1, 3, 6, 2, 4, 5, 7,  //
      2, 3, 6, 0, 1, 4, 5, 7, /**/ 0, 2, 3, 6, 1, 4, 5, 7,  //
      1, 2, 3, 6, 0, 4, 5, 7, /**/ 0, 1, 2, 3, 6, 4, 5, 7,  //
      4, 6, 0, 1, 2, 3, 5, 7, /**/ 0, 4, 6, 1, 2, 3, 5, 7,  //
      1, 4, 6, 0, 2, 3, 5, 7, /**/ 0, 1, 4, 6, 2, 3, 5, 7,  //
      2, 4, 6, 0, 1, 3, 5, 7, /**/ 0, 2, 4, 6, 1, 3, 5, 7,  //
      1, 2, 4, 6, 0, 3, 5, 7, /**/ 0, 1, 2, 4, 6, 3, 5, 7,  //
      3, 4, 6, 0, 1, 2, 5, 7, /**/ 0, 3, 4, 6, 1, 2, 5, 7,  //
      1, 3, 4, 6, 0, 2, 5, 7, /**/ 0, 1, 3, 4, 6, 2, 5, 7,  //
      2, 3, 4, 6, 0, 1, 5, 7, /**/ 0, 2, 3, 4, 6, 1, 5, 7,  //
      1, 2, 3, 4, 6, 0, 5, 7, /**/ 0, 1, 2, 3, 4, 6, 5, 7,  //
      5, 6, 0, 1, 2, 3, 4, 7, /**/ 0, 5, 6, 1, 2, 3, 4, 7,  //
      1, 5, 6, 0, 2, 3, 4, 7, /**/ 0, 1, 5, 6, 2, 3, 4, 7,  //
      2, 5, 6, 0, 1, 3, 4, 7, /**/ 0, 2, 5, 6, 1, 3, 4, 7,  //
      1, 2, 5, 6, 0, 3, 4, 7, /**/ 0, 1, 2, 5, 6, 3, 4, 7,  //
      3, 5, 6, 0, 1, 2, 4, 7, /**/ 0, 3, 5, 6, 1, 2, 4, 7,  //
      1, 3, 5, 6, 0, 2, 4, 7, /**/ 0, 1, 3, 5, 6, 2, 4, 7,  //
      2, 3, 5, 6, 0, 1, 4, 7, /**/ 0, 2, 3, 5, 6, 1, 4, 7,  //
      1, 2, 3, 5, 6, 0, 4, 7, /**/ 0, 1, 2, 3, 5, 6, 4, 7,  //
      4, 5, 6, 0, 1, 2, 3, 7, /**/ 0, 4, 5, 6, 1, 2, 3, 7,  //
      1, 4, 5, 6, 0, 2, 3, 7, /**/ 0, 1, 4, 5, 6, 2, 3, 7,  //
      2, 4, 5, 6, 0, 1, 3, 7, /**/ 0, 2, 4, 5, 6, 1, 3, 7,  //
      1, 2, 4, 5, 6, 0, 3, 7, /**/ 0, 1, 2, 4, 5, 6, 3, 7,  //
      3, 4, 5, 6, 0, 1, 2, 7, /**/ 0, 3, 4, 5, 6, 1, 2, 7,  //
      1, 3, 4, 5, 6, 0, 2, 7, /**/ 0, 1, 3, 4, 5, 6, 2, 7,  //
      2, 3, 4, 5, 6, 0, 1, 7, /**/ 0, 2, 3, 4, 5, 6, 1, 7,  //
      1, 2, 3, 4, 5, 6, 0, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
      7, 0, 1, 2, 3, 4, 5, 6, /**/ 0, 7, 1, 2, 3, 4, 5, 6,  //
      1, 7, 0, 2, 3, 4, 5, 6, /**/ 0, 1, 7, 2, 3, 4, 5, 6,  //
      2, 7, 0, 1, 3, 4, 5, 6, /**/ 0, 2, 7, 1, 3, 4, 5, 6,  //
      1, 2, 7, 0, 3, 4, 5, 6, /**/ 0, 1, 2, 7, 3, 4, 5, 6,  //
      3, 7, 0, 1, 2, 4, 5, 6, /**/ 0, 3, 7, 1, 2, 4, 5, 6,  //
      1, 3, 7, 0, 2, 4, 5, 6, /**/ 0, 1, 3, 7, 2, 4, 5, 6,  //
      2, 3, 7, 0, 1, 4, 5, 6, /**/ 0, 2, 3, 7, 1, 4, 5, 6,  //
      1, 2, 3, 7, 0, 4, 5, 6, /**/ 0, 1, 2, 3, 7, 4, 5, 6,  //
      4, 7, 0, 1, 2, 3, 5, 6, /**/ 0, 4, 7, 1, 2, 3, 5, 6,  //
      1, 4, 7, 0, 2, 3, 5, 6, /**/ 0, 1, 4, 7, 2, 3, 5, 6,  //
      2, 4, 7, 0, 1, 3, 5, 6, /**/ 0, 2, 4, 7, 1, 3, 5, 6,  //
      1, 2, 4, 7, 0, 3, 5, 6, /**/ 0, 1, 2, 4, 7, 3, 5, 6,  //
      3, 4, 7, 0, 1, 2, 5, 6, /**/ 0, 3, 4, 7, 1, 2, 5, 6,  //
      1, 3, 4, 7, 0, 2, 5, 6, /**/ 0, 1, 3, 4, 7, 2, 5, 6,  //
      2, 3, 4, 7, 0, 1, 5, 6, /**/ 0, 2, 3, 4, 7, 1, 5, 6,  //
      1, 2, 3, 4, 7, 0, 5, 6, /**/ 0, 1, 2, 3, 4, 7, 5, 6,  //
      5, 7, 0, 1, 2, 3, 4, 6, /**/ 0, 5, 7, 1, 2, 3, 4, 6,  //
      1, 5, 7, 0, 2, 3, 4, 6, /**/ 0, 1, 5, 7, 2, 3, 4, 6,  //
      2, 5, 7, 0, 1, 3, 4, 6, /**/ 0, 2, 5, 7, 1, 3, 4, 6,  //
      1, 2, 5, 7, 0, 3, 4, 6, /**/ 0, 1, 2, 5, 7, 3, 4, 6,  //
      3, 5, 7, 0, 1, 2, 4, 6, /**/ 0, 3, 5, 7, 1, 2, 4, 6,  //
      1, 3, 5, 7, 0, 2, 4, 6, /**/ 0, 1, 3, 5, 7, 2, 4, 6,  //
      2, 3, 5, 7, 0, 1, 4, 6, /**/ 0, 2, 3, 5, 7, 1, 4, 6,  //
      1, 2, 3, 5, 7, 0, 4, 6, /**/ 0, 1, 2, 3, 5, 7, 4, 6,  //
      4, 5, 7, 0, 1, 2, 3, 6, /**/ 0, 4, 5, 7, 1, 2, 3, 6,  //
      1, 4, 5, 7, 0, 2, 3, 6, /**/ 0, 1, 4, 5, 7, 2, 3, 6,  //
      2, 4, 5, 7, 0, 1, 3, 6, /**/ 0, 2, 4, 5, 7, 1, 3, 6,  //
      1, 2, 4, 5, 7, 0, 3, 6, /**/ 0, 1, 2, 4, 5, 7, 3, 6,  //
      3, 4, 5, 7, 0, 1, 2, 6, /**/ 0, 3, 4, 5, 7, 1, 2, 6,  //
      1, 3, 4, 5, 7, 0, 2, 6, /**/ 0, 1, 3, 4, 5, 7, 2, 6,  //
      2, 3, 4, 5, 7, 0, 1, 6, /**/ 0, 2, 3, 4, 5, 7, 1, 6,  //
      1, 2, 3, 4, 5, 7, 0, 6, /**/ 0, 1, 2, 3, 4, 5, 7, 6,  //
      6, 7, 0, 1, 2, 3, 4, 5, /**/ 0, 6, 7, 1, 2, 3, 4, 5,  //
      1, 6, 7, 0, 2, 3, 4, 5, /**/ 0, 1, 6, 7, 2, 3, 4, 5,  //
      2, 6, 7, 0, 1, 3, 4, 5, /**/ 0, 2, 6, 7, 1, 3, 4, 5,  //
      1, 2, 6, 7, 0, 3, 4, 5, /**/ 0, 1, 2, 6, 7, 3, 4, 5,  //
      3, 6, 7, 0, 1, 2, 4, 5, /**/ 0, 3, 6, 7, 1, 2, 4, 5,  //
      1, 3, 6, 7, 0, 2, 4, 5, /**/ 0, 1, 3, 6, 7, 2, 4, 5,  //
      2, 3, 6, 7, 0, 1, 4, 5, /**/ 0, 2, 3, 6, 7, 1, 4, 5,  //
      1, 2, 3, 6, 7, 0, 4, 5, /**/ 0, 1, 2, 3, 6, 7, 4, 5,  //
      4, 6, 7, 0, 1, 2, 3, 5, /**/ 0, 4, 6, 7, 1, 2, 3, 5,  //
      1, 4, 6, 7, 0, 2, 3, 5, /**/ 0, 1, 4, 6, 7, 2, 3, 5,  //
      2, 4, 6, 7, 0, 1, 3, 5, /**/ 0, 2, 4, 6, 7, 1, 3, 5,  //
      1, 2, 4, 6, 7, 0, 3, 5, /**/ 0, 1, 2, 4, 6, 7, 3, 5,  //
      3, 4, 6, 7, 0, 1, 2, 5, /**/ 0, 3, 4, 6, 7, 1, 2, 5,  //
      1, 3, 4, 6, 7, 0, 2, 5, /**/ 0, 1, 3, 4, 6, 7, 2, 5,  //
      2, 3, 4, 6, 7, 0, 1, 5, /**/ 0, 2, 3, 4, 6, 7, 1, 5,  //
      1, 2, 3, 4, 6, 7, 0, 5, /**/ 0, 1, 2, 3, 4, 6, 7, 5,  //
      5, 6, 7, 0, 1, 2, 3, 4, /**/ 0, 5, 6, 7, 1, 2, 3, 4,  //
      1, 5, 6, 7, 0, 2, 3, 4, /**/ 0, 1, 5, 6, 7, 2, 3, 4,  //
      2, 5, 6, 7, 0, 1, 3, 4, /**/ 0, 2, 5, 6, 7, 1, 3, 4,  //
      1, 2, 5, 6, 7, 0, 3, 4, /**/ 0, 1, 2, 5, 6, 7, 3, 4,  //
      3, 5, 6, 7, 0, 1, 2, 4, /**/ 0, 3, 5, 6, 7, 1, 2, 4,  //
      1, 3, 5, 6, 7, 0, 2, 4, /**/ 0, 1, 3, 5, 6, 7, 2, 4,  //
      2, 3, 5, 6, 7, 0, 1, 4, /**/ 0, 2, 3, 5, 6, 7, 1, 4,  //
      1, 2, 3, 5, 6, 7, 0, 4, /**/ 0, 1, 2, 3, 5, 6, 7, 4,  //
      4, 5, 6, 7, 0, 1, 2, 3, /**/ 0, 4, 5, 6, 7, 1, 2, 3,  //
      1, 4, 5, 6, 7, 0, 2, 3, /**/ 0, 1, 4, 5, 6, 7, 2, 3,  //
      2, 4, 5, 6, 7, 0, 1, 3, /**/ 0, 2, 4, 5, 6, 7, 1, 3,  //
      1, 2, 4, 5, 6, 7, 0, 3, /**/ 0, 1, 2, 4, 5, 6, 7, 3,  //
      3, 4, 5, 6, 7, 0, 1, 2, /**/ 0, 3, 4, 5, 6, 7, 1, 2,  //
      1, 3, 4, 5, 6, 7, 0, 2, /**/ 0, 1, 3, 4, 5, 6, 7, 2,  //
      2, 3, 4, 5, 6, 7, 0, 1, /**/ 0, 2, 3, 4, 5, 6, 7, 1,  //
      1, 2, 3, 4, 5, 6, 7, 0, /**/ 0, 1, 2, 3, 4, 5, 6, 7};

  for (size_t i = 0; i < Lanes(d); i += 8) {
    // Each byte worth of bits is the index of one of 256 8-byte ranges, and its
    // population count determines how far to advance the write position.
    const size_t bits8 = bits[i / 8];
    const auto indices = Load(d8, table + bits8 * 8);
    const auto compressed = TableLookupBytes(LoadU(d8, lanes + i), indices);
    StoreU(compressed, d8, pos);
    pos += PopCount(bits8);
  }
  return static_cast<size_t>(pos - unaligned);
}

template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API size_t CompressStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) {
  uint8_t bits[HWY_MAX(size_t{8}, MaxLanes(d) / 8)];
  (void)StoreMaskBits(d, mask, bits);
  return CompressBitsStore(v, bits, d, unaligned);
}

template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API size_t CompressBlendedStore(V v, M mask, D d,
                                    T* HWY_RESTRICT unaligned) {
  HWY_ALIGN T buf[MaxLanes(d)];
  const size_t bytes = CompressStore(v, mask, d, buf);
  BlendedStore(Load(d, buf), FirstN(d, bytes), d, unaligned);
  return bytes;
}

// For reasons unknown, HWY_IF_T_SIZE_V is a compile error in SVE.
template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
HWY_API V Compress(V v, const M mask) {
  const DFromV<V> d;
  HWY_ALIGN T lanes[MaxLanes(d)];
  (void)CompressStore(v, mask, d, lanes);
  return Load(d, lanes);
}

template <class V, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
  const DFromV<V> d;
  HWY_ALIGN T lanes[MaxLanes(d)];
  (void)CompressBitsStore(v, bits, d, lanes);
  return Load(d, lanes);
}

template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
HWY_API V CompressNot(V v, M mask) {
  return Compress(v, Not(mask));
}

#endif  // HWY_NATIVE_COMPRESS8

// ------------------------------ Expand

// Note that this generic implementation assumes <= 128 bit fixed vectors;
// the SVE and RVV targets provide their own native implementations.
#if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
#ifdef HWY_NATIVE_EXPAND
#undef HWY_NATIVE_EXPAND
#else
#define HWY_NATIVE_EXPAND
#endif

namespace detail {

#if HWY_IDE
template <class M>
HWY_INLINE uint64_t BitsFromMask(M /* mask */) {
  return 0;
}
#endif  // HWY_IDE

template <size_t N>
HWY_INLINE Vec128<uint8_t, N> IndicesForExpandFromBits(uint64_t mask_bits) {
  static_assert(N <= 8, "Should only be called for half-vectors");
  const Simd<uint8_t, N, 0> du8;
  HWY_DASSERT(mask_bits < 0x100);
  alignas(16) static constexpr uint8_t table[2048] = {
      // PrintExpand8x8Tables
      128, 128, 128, 128, 128, 128, 128, 128,  //
      0,   128, 128, 128, 128, 128, 128, 128,  //
      128, 0,   128, 128, 128, 128, 128, 128,  //
      0,   1,   128, 128, 128, 128, 128, 128,  //
      128, 128, 0,   128, 128, 128, 128, 128,  //
      0,   128, 1,   128, 128, 128, 128, 128,  //
      128, 0,   1,   128, 128, 128, 128, 128,  //
      0,   1,   2,   128, 128, 128, 128, 128,  //
      128, 128, 128, 0,   128, 128, 128, 128,  //
      0,   128, 128, 1,   128, 128, 128, 128,  //
      128, 0,   128, 1,   128, 128, 128, 128,  //
      0,   1,   128, 2,   128, 128, 128, 128,  //
      128, 128, 0,   1,   128, 128, 128, 128,  //
      0,   128, 1,   2,   128, 128, 128, 128,  //
      128, 0,   1,   2,   128, 128, 128, 128,  //
      0,   1,   2,   3,   128, 128, 128, 128,  //
      128, 128, 128, 128, 0,   128, 128, 128,  //
      0,   128, 128, 128, 1,   128, 128, 128,  //
      128, 0,   128, 128, 1,   128, 128, 128,  //
      0,   1,   128, 128, 2,   128, 128, 128,  //
      128, 128, 0,   128, 1,   128, 128, 128,  //
      0,   128, 1,   128, 2,   128, 128, 128,  //
      128, 0,   1,   128, 2,   128, 128, 128,  //
      0,   1,   2,   128, 3,   128, 128, 128,  //
      128, 128, 128, 0,   1,   128, 128, 128,  //
      0,   128, 128, 1,   2,   128, 128, 128,  //
      128, 0,   128, 1,   2,   128, 128, 128,  //
      0,   1,   128, 2,   3,   128, 128, 128,  //
      128, 128, 0,   1,   2,   128, 128, 128,  //
      0,   128, 1,   2,   3,   128, 128, 128,  //
      128, 0,   1,   2,   3,   128, 128, 128,  //
      0,   1,   2,   3,   4,   128, 128, 128,  //
      128, 128, 128, 128, 128, 0,   128, 128,  //
      0,   128, 128, 128, 128, 1,   128, 128,  //
      128, 0,   128, 128, 128, 1,   128, 128,  //
      0,   1,   128, 128, 128, 2,   128, 128,  //
      128, 128, 0,   128, 128, 1,   128, 128,  //
      0,   128, 1,   128, 128, 2,   128, 128,  //
      128, 0,   1,   128, 128, 2,   128, 128,  //
      0,   1,   2,   128, 128, 3,   128, 128,  //
      128, 128, 128, 0,   128, 1,   128, 128,  //
      0,   128, 128, 1,   128, 2,   128, 128,  //
      128, 0,   128, 1,   128, 2,   128, 128,  //
      0,   1,   128, 2,   128, 3,   128, 128,  //
      128, 128, 0,   1,   128, 2,   128, 128,  //
      0,   128, 1,   2,   128, 3,   128, 128,  //
      128, 0,   1,   2,   128, 3,   128, 128,  //
      0,   1,   2,   3,   128, 4,   128, 128,  //
      128, 128, 128, 128, 0,   1,   128, 128,  //
      0,   128, 128, 128, 1,   2,   128, 128,  //
      128, 0,   128, 128, 1,   2,   128, 128,  //
      0,   1,   128, 128, 2,   3,   128, 128,  //
      128, 128, 0,   128, 1,   2,   128, 128,  //
      0,   128, 1,   128, 2,   3,   128, 128,  //
      128, 0,   1,   128, 2,   3,   128, 128,  //
      0,   1,   2,   128, 3,   4,   128, 128,  //
      128, 128, 128, 0,   1,   2,   128, 128,  //
      0,   128, 128, 1,   2,   3,   128, 128,  //
      128, 0,   128, 1,   2,   3,   128, 128,  //
      0,   1,   128, 2,   3,   4,   128, 128,  //
      128, 128, 0,   1,   2,   3,   128, 128,  //
      0,   128, 1,   2,   3,   4,   128, 128,  //
      128, 0,   1,   2,   3,   4,   128, 128,  //
      0,   1,   2,   3,   4,   5,   128, 128,  //
      128, 128, 128, 128, 128, 128, 0,   128,  //
      0,   128, 128, 128, 128, 128, 1,   128,  //
      128, 0,   128, 128, 128, 128, 1,   128,  //
      0,   1,   128, 128, 128, 128, 2,   128,  //
      128, 128, 0,   128, 128, 128, 1,   128,  //
      0,   128, 1,   128, 128, 128, 2,   128,  //
      128, 0,   1,   128, 128, 128, 2,   128,  //
      0,   1,   2,   128, 128, 128, 3,   128,  //
      128, 128, 128, 0,   128, 128, 1,   128,  //
      0,   128, 128, 1,   128, 128, 2,   128,  //
      128, 0,   128, 1,   128, 128, 2,   128,  //
      0,   1,   128, 2,   128, 128, 3,   128,  //
      128, 128, 0,   1,   128, 128, 2,   128,  //
      0,   128, 1,   2,   128, 128, 3,   128,  //
      128, 0,   1,   2,   128, 128, 3,   128,  //
      0,   1,   2,   3,   128, 128, 4,   128,  //
      128, 128, 128, 128, 0,   128, 1,   128,  //
      0,   128, 128, 128, 1,   128, 2,   128,  //
      128, 0,   128, 128, 1,   128, 2,   128,  //
      0,   1,   128, 128, 2,   128, 3,   128,  //
      128, 128, 0,   128, 1,   128, 2,   128,  //
      0,   128, 1,   128, 2,   128, 3,   128,  //
      128, 0,   1,   128, 2,   128, 3,   128,  //
      0,   1,   2,   128, 3,   128, 4,   128,  //
      128, 128, 128, 0,   1,   128, 2,   128,  //
      0,   128, 128, 1,   2,   128, 3,   128,  //
      128, 0,   128, 1,   2,   128, 3,   128,  //
      0,   1,   128, 2,   3,   128, 4,   128,  //
      128, 128, 0,   1,   2,   128, 3,   128,  //
      0,   128, 1,   2,   3,   128, 4,   128,  //
      128, 0,   1,   2,   3,   128, 4,   128,  //
      0,   1,   2,   3,   4,   128, 5,   128,  //
      128, 128, 128, 128, 128, 0,   1,   128,  //
      0,   128, 128, 128, 128, 1,   2,   128,  //
      128, 0,   128, 128, 128, 1,   2,   128,  //
      0,   1,   128, 128, 128, 2,   3,   128,  //
      128, 128, 0,   128, 128, 1,   2,   128,  //
      0,   128, 1,   128, 128, 2,   3,   128,  //
      128, 0,   1,   128, 128, 2,   3,   128,  //
      0,   1,   2,   128, 128, 3,   4,   128,  //
      128, 128, 128, 0,   128, 1,   2,   128,  //
      0,   128, 128, 1,   128, 2,   3,   128,  //
      128, 0,   128, 1,   128, 2,   3,   128,  //
      0,   1,   128, 2,   128, 3,   4,   128,  //
      128, 128, 0,   1,   128, 2,   3,   128,  //
      0,   128, 1,   2,   128, 3,   4,   128,  //
      128, 0,   1,   2,   128, 3,   4,   128,  //
      0,   1,   2,   3,   128, 4,   5,   128,  //
      128, 128, 128, 128, 0,   1,   2,   128,  //
      0,   128, 128, 128, 1,   2,   3,   128,  //
      128, 0,   128, 128, 1,   2,   3,   128,  //
      0,   1,   128, 128, 2,   3,   4,   128,  //
      128, 128, 0,   128, 1,   2,   3,   128,  //
      0,   128, 1,   128, 2,   3,   4,   128,  //
      128, 0,   1,   128, 2,   3,   4,   128,  //
      0,   1,   2,   128, 3,   4,   5,   128,  //
      128, 128, 128, 0,   1,   2,   3,   128,  //
      0,   128, 128, 1,   2,   3,   4,   128,  //
      128, 0,   128, 1,   2,   3,   4,   128,  //
      0,   1,   128, 2,   3,   4,   5,   128,  //
      128, 128, 0,   1,   2,   3,   4,   128,  //
      0,   128, 1,   2,   3,   4,   5,   128,  //
      128, 0,   1,   2,   3,   4,   5,   128,  //
      0,   1,   2,   3,   4,   5,   6,   128,  //
      128, 128, 128, 128, 128, 128, 128, 0,    //
      0,   128, 128, 128, 128, 128, 128, 1,    //
      128, 0,   128, 128, 128, 128, 128, 1,    //
      0,   1,   128, 128, 128, 128, 128, 2,    //
      128, 128, 0,   128, 128, 128, 128, 1,    //
      0,   128, 1,   128, 128, 128, 128, 2,    //
      128, 0,   1,   128, 128, 128, 128, 2,    //
      0,   1,   2,   128, 128, 128, 128, 3,    //
      128, 128, 128, 0,   128, 128, 128, 1,    //
      0,   128, 128, 1,   128, 128, 128, 2,    //
      128, 0,   128, 1,   128, 128, 128, 2,    //
      0,   1,   128, 2,   128, 128, 128, 3,    //
      128, 128, 0,   1,   128, 128, 128, 2,    //
      0,   128, 1,   2,   128, 128, 128, 3,    //
      128, 0,   1,   2,   128, 128, 128, 3,    //
      0,   1,   2,   3,   128, 128, 128, 4,    //
      128, 128, 128, 128, 0,   128, 128, 1,    //
      0,   128, 128, 128, 1,   128, 128, 2,    //
      128, 0,   128, 128, 1,   128, 128, 2,    //
      0,   1,   128, 128, 2,   128, 128, 3,    //
      128, 128, 0,   128, 1,   128, 128, 2,    //
      0,   128, 1,   128, 2,   128, 128, 3,    //
      128, 0,   1,   128, 2,   128, 128, 3,    //
      0,   1,   2,   128, 3,   128, 128, 4,    //
      128, 128, 128, 0,   1,   128, 128, 2,    //
      0,   128, 128, 1,   2,   128, 128, 3,    //
      128, 0,   128, 1,   2,   128, 128, 3,    //
      0,   1,   128, 2,   3,   128, 128, 4,    //
      128, 128, 0,   1,   2,   128, 128, 3,    //
      0,   128, 1,   2,   3,   128, 128, 4,    //
      128, 0,   1,   2,   3,   128, 128, 4,    //
      0,   1,   2,   3,   4,   128, 128, 5,    //
      128, 128, 128, 128, 128, 0,   128, 1,    //
      0,   128, 128, 128, 128, 1,   128, 2,    //
      128, 0,   128, 128, 128, 1,   128, 2,    //
      0,   1,   128, 128, 128, 2,   128, 3,    //
      128, 128, 0,   128, 128, 1,   128, 2,    //
      0,   128, 1,   128, 128, 2,   128, 3,    //
      128, 0,   1,   128, 128, 2,   128, 3,    //
      0,   1,   2,   128, 128, 3,   128, 4,    //
      128, 128, 128, 0,   128, 1,   128, 2,    //
      0,   128, 128, 1,   128, 2,   128, 3,    //
      128, 0,   128, 1,   128, 2,   128, 3,    //
      0,   1,   128, 2,   128, 3,   128, 4,    //
      128, 128, 0,   1,   128, 2,   128, 3,    //
      0,   128, 1,   2,   128, 3,   128, 4,    //
      128, 0,   1,   2,   128, 3,   128, 4,    //
      0,   1,   2,   3,   128, 4,   128, 5,    //
      128, 128, 128, 128, 0,   1,   128, 2,    //
      0,   128, 128, 128, 1,   2,   128, 3,    //
      128, 0,   128, 128, 1,   2,   128, 3,    //
      0,   1,   128, 128, 2,   3,   128, 4,    //
      128, 128, 0,   128, 1,   2,   128, 3,    //
      0,   128, 1,   128, 2,   3,   128, 4,    //
      128, 0,   1,   128, 2,   3,   128, 4,    //
      0,   1,   2,   128, 3,   4,   128, 5,    //
      128, 128, 128, 0,   1,   2,   128, 3,    //
      0,   128, 128, 1,   2,   3,   128, 4,    //
      128, 0,   128, 1,   2,   3,   128, 4,    //
      0,   1,   128, 2,   3,   4,   128, 5,    //
      128, 128, 0,   1,   2,   3,   128, 4,    //
      0,   128, 1,   2,   3,   4,   128, 5,    //
      128, 0,   1,   2,   3,   4,   128, 5,    //
      0,   1,   2,   3,   4,   5,   128, 6,    //
      128, 128, 128, 128, 128, 128, 0,   1,    //
      0,   128, 128, 128, 128, 128, 1,   2,    //
      128, 0,   128, 128, 128, 128, 1,   2,    //
      0,   1,   128, 128, 128, 128, 2,   3,    //
      128, 128, 0,   128, 128, 128, 1,   2,    //
      0,   128, 1,   128, 128, 128, 2,   3,    //
      128, 0,   1,   128, 128, 128, 2,   3,    //
      0,   1,   2,   128, 128, 128, 3,   4,    //
      128, 128, 128, 0,   128, 128, 1,   2,    //
      0,   128, 128, 1,   128, 128, 2,   3,    //
      128, 0,   128, 1,   128, 128, 2,   3,    //
      0,   1,   128, 2,   128, 128, 3,   4,    //
      128, 128, 0,   1,   128, 128, 2,   3,    //
      0,   128, 1,   2,   128, 128, 3,   4,    //
      128, 0,   1,   2,   128, 128, 3,   4,    //
      0,   1,   2,   3,   128, 128, 4,   5,    //
      128, 128, 128, 128, 0,   128, 1,   2,    //
      0,   128, 128, 128, 1,   128, 2,   3,    //
      128, 0,   128, 128, 1,   128, 2,   3,    //
      0,   1,   128, 128, 2,   128, 3,   4,    //
      128, 128, 0,   128, 1,   128, 2,   3,    //
      0,   128, 1,   128, 2,   128, 3,   4,    //
      128, 0,   1,   128, 2,   128, 3,   4,    //
      0,   1,   2,   128, 3,   128, 4,   5,    //
      128, 128, 128, 0,   1,   128, 2,   3,    //
      0,   128, 128, 1,   2,   128, 3,   4,    //
      128, 0,   128, 1,   2,   128, 3,   4,    //
      0,   1,   128, 2,   3,   128, 4,   5,    //
      128, 128, 0,   1,   2,   128, 3,   4,    //
      0,   128, 1,   2,   3,   128, 4,   5,    //
      128, 0,   1,   2,   3,   128, 4,   5,    //
      0,   1,   2,   3,   4,   128, 5,   6,    //
      128, 128, 128, 128, 128, 0,   1,   2,    //
      0,   128, 128, 128, 128, 1,   2,   3,    //
      128, 0,   128, 128, 128, 1,   2,   3,    //
      0,   1,   128, 128, 128, 2,   3,   4,    //
      128, 128, 0,   128, 128, 1,   2,   3,    //
      0,   128, 1,   128, 128, 2,   3,   4,    //
      128, 0,   1,   128, 128, 2,   3,   4,    //
      0,   1,   2,   128, 128, 3,   4,   5,    //
      128, 128, 128, 0,   128, 1,   2,   3,    //
      0,   128, 128, 1,   128, 2,   3,   4,    //
      128, 0,   128, 1,   128, 2,   3,   4,    //
      0,   1,   128, 2,   128, 3,   4,   5,    //
      128, 128, 0,   1,   128, 2,   3,   4,    //
      0,   128, 1,   2,   128, 3,   4,   5,    //
      128, 0,   1,   2,   128, 3,   4,   5,    //
      0,   1,   2,   3,   128, 4,   5,   6,    //
      128, 128, 128, 128, 0,   1,   2,   3,    //
      0,   128, 128, 128, 1,   2,   3,   4,    //
      128, 0,   128, 128, 1,   2,   3,   4,    //
      0,   1,   128, 128, 2,   3,   4,   5,    //
      128, 128, 0,   128, 1,   2,   3,   4,    //
      0,   128, 1,   128, 2,   3,   4,   5,    //
      128, 0,   1,   128, 2,   3,   4,   5,    //
      0,   1,   2,   128, 3,   4,   5,   6,    //
      128, 128, 128, 0,   1,   2,   3,   4,    //
      0,   128, 128, 1,   2,   3,   4,   5,    //
      128, 0,   128, 1,   2,   3,   4,   5,    //
      0,   1,   128, 2,   3,   4,   5,   6,    //
      128, 128, 0,   1,   2,   3,   4,   5,    //
      0,   128, 1,   2,   3,   4,   5,   6,    //
      128, 0,   1,   2,   3,   4,   5,   6,    //
      0,   1,   2,   3,   4,   5,   6,   7};
  return LoadU(du8, table + mask_bits * 8);
}

}  // namespace detail

// Half vector of bytes: one table lookup
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;

  const uint64_t mask_bits = detail::BitsFromMask(mask);
  const Vec128<uint8_t, N> indices =
      detail::IndicesForExpandFromBits<N>(mask_bits);
  return BitCast(d, TableLookupBytesOr0(v, indices));
}

// Full vector of bytes: two table lookups
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
  const Full128<T> d;
  const RebindToUnsigned<decltype(d)> du;
  const Half<decltype(du)> duh;
  const Vec128<uint8_t> vu = BitCast(du, v);

  const uint64_t mask_bits = detail::BitsFromMask(mask);
  const uint64_t maskL = mask_bits & 0xFF;
  const uint64_t maskH = mask_bits >> 8;

  // We want to skip past the v bytes already consumed by idxL. There is no
  // instruction for shift-reg by variable bytes. Storing v itself would work
  // but would involve a store-load forwarding stall. We instead shuffle using
  // loaded indices. multishift_epi64_epi8 would also help, but if we have that,
  // we probably also have native 8-bit Expand.
  alignas(16) static constexpr uint8_t iota[32] = {
      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,
      11,  12,  13,  14,  15,  128, 128, 128, 128, 128, 128,
      128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
  const VFromD<decltype(du)> shift = LoadU(du, iota + PopCount(maskL));
  const VFromD<decltype(duh)> vL = LowerHalf(duh, vu);
  const VFromD<decltype(duh)> vH =
      LowerHalf(duh, TableLookupBytesOr0(vu, shift));

  const VFromD<decltype(duh)> idxL = detail::IndicesForExpandFromBits<8>(maskL);
  const VFromD<decltype(duh)> idxH = detail::IndicesForExpandFromBits<8>(maskH);

  const VFromD<decltype(duh)> expandL = TableLookupBytesOr0(vL, idxL);
  const VFromD<decltype(duh)> expandH = TableLookupBytesOr0(vH, idxH);
  return BitCast(d, Combine(du, expandH, expandL));
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  const Rebind<uint8_t, decltype(d)> du8;
  const uint64_t mask_bits = detail::BitsFromMask(mask);

  // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply
  // the nibble trick used below because not all indices fit within one lane.
  alignas(16) static constexpr uint8_t table[2048] = {
      // PrintExpand16x8ByteTables
      128, 128, 128, 128, 128, 128, 128, 128,  //
      0,   128, 128, 128, 128, 128, 128, 128,  //
      128, 0,   128, 128, 128, 128, 128, 128,  //
      0,   2,   128, 128, 128, 128, 128, 128,  //
      128, 128, 0,   128, 128, 128, 128, 128,  //
      0,   128, 2,   128, 128, 128, 128, 128,  //
      128, 0,   2,   128, 128, 128, 128, 128,  //
      0,   2,   4,   128, 128, 128, 128, 128,  //
      128, 128, 128, 0,   128, 128, 128, 128,  //
      0,   128, 128, 2,   128, 128, 128, 128,  //
      128, 0,   128, 2,   128, 128, 128, 128,  //
      0,   2,   128, 4,   128, 128, 128, 128,  //
      128, 128, 0,   2,   128, 128, 128, 128,  //
      0,   128, 2,   4,   128, 128, 128, 128,  //
      128, 0,   2,   4,   128, 128, 128, 128,  //
      0,   2,   4,   6,   128, 128, 128, 128,  //
      128, 128, 128, 128, 0,   128, 128, 128,  //
      0,   128, 128, 128, 2,   128, 128, 128,  //
      128, 0,   128, 128, 2,   128, 128, 128,  //
      0,   2,   128, 128, 4,   128, 128, 128,  //
      128, 128, 0,   128, 2,   128, 128, 128,  //
      0,   128, 2,   128, 4,   128, 128, 128,  //
      128, 0,   2,   128, 4,   128, 128, 128,  //
      0,   2,   4,   128, 6,   128, 128, 128,  //
      128, 128, 128, 0,   2,   128, 128, 128,  //
      0,   128, 128, 2,   4,   128, 128, 128,  //
      128, 0,   128, 2,   4,   128, 128, 128,  //
      0,   2,   128, 4,   6,   128, 128, 128,  //
      128, 128, 0,   2,   4,   128, 128, 128,  //
      0,   128, 2,   4,   6,   128, 128, 128,  //
      128, 0,   2,   4,   6,   128, 128, 128,  //
      0,   2,   4,   6,   8,   128, 128, 128,  //
      128, 128, 128, 128, 128, 0,   128, 128,  //
      0,   128, 128, 128, 128, 2,   128, 128,  //
      128, 0,   128, 128, 128, 2,   128, 128,  //
      0,   2,   128, 128, 128, 4,   128, 128,  //
      128, 128, 0,   128, 128, 2,   128, 128,  //
      0,   128, 2,   128, 128, 4,   128, 128,  //
      128, 0,   2,   128, 128, 4,   128, 128,  //
      0,   2,   4,   128, 128, 6,   128, 128,  //
      128, 128, 128, 0,   128, 2,   128, 128,  //
      0,   128, 128, 2,   128, 4,   128, 128,  //
      128, 0,   128, 2,   128, 4,   128, 128,  //
      0,   2,   128, 4,   128, 6,   128, 128,  //
      128, 128, 0,   2,   128, 4,   128, 128,  //
      0,   128, 2,   4,   128, 6,   128, 128,  //
      128, 0,   2,   4,   128, 6,   128, 128,  //
      0,   2,   4,   6,   128, 8,   128, 128,  //
      128, 128, 128, 128, 0,   2,   128, 128,  //
      0,   128, 128, 128, 2,   4,   128, 128,  //
      128, 0,   128, 128, 2,   4,   128, 128,  //
      0,   2,   128, 128, 4,   6,   128, 128,  //
      128, 128, 0,   128, 2,   4,   128, 128,  //
      0,   128, 2,   128, 4,   6,   128, 128,  //
      128, 0,   2,   128, 4,   6,   128, 128,  //
      0,   2,   4,   128, 6,   8,   128, 128,  //
      128, 128, 128, 0,   2,   4,   128, 128,  //
      0,   128, 128, 2,   4,   6,   128, 128,  //
      128, 0,   128, 2,   4,   6,   128, 128,  //
      0,   2,   128, 4,   6,   8,   128, 128,  //
      128, 128, 0,   2,   4,   6,   128, 128,  //
      0,   128, 2,   4,   6,   8,   128, 128,  //
      128, 0,   2,   4,   6,   8,   128, 128,  //
      0,   2,   4,   6,   8,   10,  128, 128,  //
      128, 128, 128, 128, 128, 128, 0,   128,  //
      0,   128, 128, 128, 128, 128, 2,   128,  //
      128, 0,   128, 128, 128, 128, 2,   128,  //
      0,   2,   128, 128, 128, 128, 4,   128,  //
      128, 128, 0,   128, 128, 128, 2,   128,  //
      0,   128, 2,   128, 128, 128, 4,   128,  //
      128, 0,   2,   128, 128, 128, 4,   128,  //
      0,   2,   4,   128, 128, 128, 6,   128,  //
      128, 128, 128, 0,   128, 128, 2,   128,  //
      0,   128, 128, 2,   128, 128, 4,   128,  //
      128, 0,   128, 2,   128, 128, 4,   128,  //
      0,   2,   128, 4,   128, 128, 6,   128,  //
      128, 128, 0,   2,   128, 128, 4,   128,  //
      0,   128, 2,   4,   128, 128, 6,   128,  //
      128, 0,   2,   4,   128, 128, 6,   128,  //
      0,   2,   4,   6,   128, 128, 8,   128,  //
      128, 128, 128, 128, 0,   128, 2,   128,  //
      0,   128, 128, 128, 2,   128, 4,   128,  //
      128, 0,   128, 128, 2,   128, 4,   128,  //
      0,   2,   128, 128, 4,   128, 6,   128,  //
      128, 128, 0,   128, 2,   128, 4,   128,  //
      0,   128, 2,   128, 4,   128, 6,   128,  //
      128, 0,   2,   128, 4,   128, 6,   128,  //
      0,   2,   4,   128, 6,   128, 8,   128,  //
      128, 128, 128, 0,   2,   128, 4,   128,  //
      0,   128, 128, 2,   4,   128, 6,   128,  //
      128, 0,   128, 2,   4,   128, 6,   128,  //
      0,   2,   128, 4,   6,   128, 8,   128,  //
      128, 128, 0,   2,   4,   128, 6,   128,  //
      0,   128, 2,   4,   6,   128, 8,   128,  //
      128, 0,   2,   4,   6,   128, 8,   128,  //
      0,   2,   4,   6,   8,   128, 10,  128,  //
      128, 128, 128, 128, 128, 0,   2,   128,  //
      0,   128, 128, 128, 128, 2,   4,   128,  //
      128, 0,   128, 128, 128, 2,   4,   128,  //
      0,   2,   128, 128, 128, 4,   6,   128,  //
      128, 128, 0,   128, 128, 2,   4,   128,  //
      0,   128, 2,   128, 128, 4,   6,   128,  //
      128, 0,   2,   128, 128, 4,   6,   128,  //
      0,   2,   4,   128, 128, 6,   8,   128,  //
      128, 128, 128, 0,   128, 2,   4,   128,  //
      0,   128, 128, 2,   128, 4,   6,   128,  //
      128, 0,   128, 2,   128, 4,   6,   128,  //
      0,   2,   128, 4,   128, 6,   8,   128,  //
      128, 128, 0,   2,   128, 4,   6,   128,  //
      0,   128, 2,   4,   128, 6,   8,   128,  //
      128, 0,   2,   4,   128, 6,   8,   128,  //
      0,   2,   4,   6,   128, 8,   10,  128,  //
      128, 128, 128, 128, 0,   2,   4,   128,  //
      0,   128, 128, 128, 2,   4,   6,   128,  //
      128, 0,   128, 128, 2,   4,   6,   128,  //
      0,   2,   128, 128, 4,   6,   8,   128,  //
      128, 128, 0,   128, 2,   4,   6,   128,  //
      0,   128, 2,   128, 4,   6,   8,   128,  //
      128, 0,   2,   128, 4,   6,   8,   128,  //
      0,   2,   4,   128, 6,   8,   10,  128,  //
      128, 128, 128, 0,   2,   4,   6,   128,  //
      0,   128, 128, 2,   4,   6,   8,   128,  //
      128, 0,   128, 2,   4,   6,   8,   128,  //
      0,   2,   128, 4,   6,   8,   10,  128,  //
      128, 128, 0,   2,   4,   6,   8,   128,  //
      0,   128, 2,   4,   6,   8,   10,  128,  //
      128, 0,   2,   4,   6,   8,   10,  128,  //
      0,   2,   4,   6,   8,   10,  12,  128,  //
      128, 128, 128, 128, 128, 128, 128, 0,    //
      0,   128, 128, 128, 128, 128, 128, 2,    //
      128, 0,   128, 128, 128, 128, 128, 2,    //
      0,   2,   128, 128, 128, 128, 128, 4,    //
      128, 128, 0,   128, 128, 128, 128, 2,    //
      0,   128, 2,   128, 128, 128, 128, 4,    //
      128, 0,   2,   128, 128, 128, 128, 4,    //
      0,   2,   4,   128, 128, 128, 128, 6,    //
      128, 128, 128, 0,   128, 128, 128, 2,    //
      0,   128, 128, 2,   128, 128, 128, 4,    //
      128, 0,   128, 2,   128, 128, 128, 4,    //
      0,   2,   128, 4,   128, 128, 128, 6,    //
      128, 128, 0,   2,   128, 128, 128, 4,    //
      0,   128, 2,   4,   128, 128, 128, 6,    //
      128, 0,   2,   4,   128, 128, 128, 6,    //
      0,   2,   4,   6,   128, 128, 128, 8,    //
      128, 128, 128, 128, 0,   128, 128, 2,    //
      0,   128, 128, 128, 2,   128, 128, 4,    //
      128, 0,   128, 128, 2,   128, 128, 4,    //
      0,   2,   128, 128, 4,   128, 128, 6,    //
      128, 128, 0,   128, 2,   128, 128, 4,    //
      0,   128, 2,   128, 4,   128, 128, 6,    //
      128, 0,   2,   128, 4,   128, 128, 6,    //
      0,   2,   4,   128, 6,   128, 128, 8,    //
      128, 128, 128, 0,   2,   128, 128, 4,    //
      0,   128, 128, 2,   4,   128, 128, 6,    //
      128, 0,   128, 2,   4,   128, 128, 6,    //
      0,   2,   128, 4,   6,   128, 128, 8,    //
      128, 128, 0,   2,   4,   128, 128, 6,    //
      0,   128, 2,   4,   6,   128, 128, 8,    //
      128, 0,   2,   4,   6,   128, 128, 8,    //
      0,   2,   4,   6,   8,   128, 128, 10,   //
      128, 128, 128, 128, 128, 0,   128, 2,    //
      0,   128, 128, 128, 128, 2,   128, 4,    //
      128, 0,   128, 128, 128, 2,   128, 4,    //
      0,   2,   128, 128, 128, 4,   128, 6,    //
      128, 128, 0,   128, 128, 2,   128, 4,    //
      0,   128, 2,   128, 128, 4,   128, 6,    //
      128, 0,   2,   128, 128, 4,   128, 6,    //
      0,   2,   4,   128, 128, 6,   128, 8,    //
      128, 128, 128, 0,   128, 2,   128, 4,    //
      0,   128, 128, 2,   128, 4,   128, 6,    //
      128, 0,   128, 2,   128, 4,   128, 6,    //
      0,   2,   128, 4,   128, 6,   128, 8,    //
      128, 128, 0,   2,   128, 4,   128, 6,    //
      0,   128, 2,   4,   128, 6,   128, 8,    //
      128, 0,   2,   4,   128, 6,   128, 8,    //
      0,   2,   4,   6,   128, 8,   128, 10,   //
      128, 128, 128, 128, 0,   2,   128, 4,    //
      0,   128, 128, 128, 2,   4,   128, 6,    //
      128, 0,   128, 128, 2,   4,   128, 6,    //
      0,   2,   128, 128, 4,   6,   128, 8,    //
      128, 128, 0,   128, 2,   4,   128, 6,    //
      0,   128, 2,   128, 4,   6,   128, 8,    //
      128, 0,   2,   128, 4,   6,   128, 8,    //
      0,   2,   4,   128, 6,   8,   128, 10,   //
      128, 128, 128, 0,   2,   4,   128, 6,    //
      0,   128, 128, 2,   4,   6,   128, 8,    //
      128, 0,   128, 2,   4,   6,   128, 8,    //
      0,   2,   128, 4,   6,   8,   128, 10,   //
      128, 128, 0,   2,   4,   6,   128, 8,    //
      0,   128, 2,   4,   6,   8,   128, 10,   //
      128, 0,   2,   4,   6,   8,   128, 10,   //
      0,   2,   4,   6,   8,   10,  128, 12,   //
      128, 128, 128, 128, 128, 128, 0,   2,    //
      0,   128, 128, 128, 128, 128, 2,   4,    //
      128, 0,   128, 128, 128, 128, 2,   4,    //
      0,   2,   128, 128, 128, 128, 4,   6,    //
      128, 128, 0,   128, 128, 128, 2,   4,    //
      0,   128, 2,   128, 128, 128, 4,   6,    //
      128, 0,   2,   128, 128, 128, 4,   6,    //
      0,   2,   4,   128, 128, 128, 6,   8,    //
      128, 128, 128, 0,   128, 128, 2,   4,    //
      0,   128, 128, 2,   128, 128, 4,   6,    //
      128, 0,   128, 2,   128, 128, 4,   6,    //
      0,   2,   128, 4,   128, 128, 6,   8,    //
      128, 128, 0,   2,   128, 128, 4,   6,    //
      0,   128, 2,   4,   128, 128, 6,   8,    //
      128, 0,   2,   4,   128, 128, 6,   8,    //
      0,   2,   4,   6,   128, 128, 8,   10,   //
      128, 128, 128, 128, 0,   128, 2,   4,    //
      0,   128, 128, 128, 2,   128, 4,   6,    //
      128, 0,   128, 128, 2,   128, 4,   6,    //
      0,   2,   128, 128, 4,   128, 6,   8,    //
      128, 128, 0,   128, 2,   128, 4,   6,    //
      0,   128, 2,   128, 4,   128, 6,   8,    //
      128, 0,   2,   128, 4,   128, 6,   8,    //
      0,   2,   4,   128, 6,   128, 8,   10,   //
      128, 128, 128, 0,   2,   128, 4,   6,    //
      0,   128, 128, 2,   4,   128, 6,   8,    //
      128, 0,   128, 2,   4,   128, 6,   8,    //
      0,   2,   128, 4,   6,   128, 8,   10,   //
      128, 128, 0,   2,   4,   128, 6,   8,    //
      0,   128, 2,   4,   6,   128, 8,   10,   //
      128, 0,   2,   4,   6,   128, 8,   10,   //
      0,   2,   4,   6,   8,   128, 10,  12,   //
      128, 128, 128, 128, 128, 0,   2,   4,    //
      0,   128, 128, 128, 128, 2,   4,   6,    //
      128, 0,   128, 128, 128, 2,   4,   6,    //
      0,   2,   128, 128, 128, 4,   6,   8,    //
      128, 128, 0,   128, 128, 2,   4,   6,    //
      0,   128, 2,   128, 128, 4,   6,   8,    //
      128, 0,   2,   128, 128, 4,   6,   8,    //
      0,   2,   4,   128, 128, 6,   8,   10,   //
      128, 128, 128, 0,   128, 2,   4,   6,    //
      0,   128, 128, 2,   128, 4,   6,   8,    //
      128, 0,   128, 2,   128, 4,   6,   8,    //
      0,   2,   128, 4,   128, 6,   8,   10,   //
      128, 128, 0,   2,   128, 4,   6,   8,    //
      0,   128, 2,   4,   128, 6,   8,   10,   //
      128, 0,   2,   4,   128, 6,   8,   10,   //
      0,   2,   4,   6,   128, 8,   10,  12,   //
      128, 128, 128, 128, 0,   2,   4,   6,    //
      0,   128, 128, 128, 2,   4,   6,   8,    //
      128, 0,   128, 128, 2,   4,   6,   8,    //
      0,   2,   128, 128, 4,   6,   8,   10,   //
      128, 128, 0,   128, 2,   4,   6,   8,    //
      0,   128, 2,   128, 4,   6,   8,   10,   //
      128, 0,   2,   128, 4,   6,   8,   10,   //
      0,   2,   4,   128, 6,   8,   10,  12,   //
      128, 128, 128, 0,   2,   4,   6,   8,    //
      0,   128, 128, 2,   4,   6,   8,   10,   //
      128, 0,   128, 2,   4,   6,   8,   10,   //
      0,   2,   128, 4,   6,   8,   10,  12,   //
      128, 128, 0,   2,   4,   6,   8,   10,   //
      0,   128, 2,   4,   6,   8,   10,  12,   //
      128, 0,   2,   4,   6,   8,   10,  12,   //
      0,   2,   4,   6,   8,   10,  12,  14};
  // Extend to double length because InterleaveLower will only use the (valid)
  // lower half, and we want N u16.
  const Twice<decltype(du8)> du8x2;
  const Vec128<uint8_t, 2 * N> indices8 =
      ZeroExtendVector(du8x2, Load(du8, table + mask_bits * 8));
  const Vec128<uint16_t, N> indices16 =
      BitCast(du, InterleaveLower(du8x2, indices8, indices8));
  // TableLookupBytesOr0 operates on bytes. To convert u16 lane indices to byte
  // indices, add 0 to even and 1 to odd byte lanes.
  const Vec128<uint16_t, N> byte_indices = Add(
      indices16,
      Set(du, static_cast<uint16_t>(HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001)));
  return BitCast(d, TableLookupBytesOr0(v, byte_indices));
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  const uint64_t mask_bits = detail::BitsFromMask(mask);

  alignas(16) static constexpr uint32_t packed_array[16] = {
      // PrintExpand64x4Nibble - same for 32x4.
      0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0,
      0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10,
      0x000010ff, 0x000021f0, 0x0000210f, 0x00003210};

  // For lane i, shift the i-th 4-bit index down to bits [0, 2).
  const Vec128<uint32_t, N> packed = Set(du, packed_array[mask_bits]);
  alignas(16) static constexpr uint32_t shifts[4] = {0, 4, 8, 12};
  Vec128<uint32_t, N> indices = packed >> Load(du, shifts);
  // AVX2 _mm256_permutexvar_epi32 will ignore upper bits, but IndicesFromVec
  // checks bounds, so clear the upper bits.
  indices = And(indices, Set(du, N - 1));
  const Vec128<uint32_t, N> expand =
      TableLookupLanes(BitCast(du, v), IndicesFromVec(du, indices));
  // TableLookupLanes cannot also zero masked-off lanes, so do that now.
  return IfThenElseZero(mask, BitCast(d, expand));
}

template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
  // Same as Compress, just zero out the mask=false lanes.
  return IfThenElseZero(mask, Compress(v, mask));
}

// For single-element vectors, this is at least as fast as native.
template <typename T>
HWY_API Vec128<T, 1> Expand(Vec128<T, 1> v, Mask128<T, 1> mask) {
  return IfThenElseZero(mask, v);
}

// ------------------------------ LoadExpand
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
                             const TFromD<D>* HWY_RESTRICT unaligned) {
  return Expand(LoadU(d, unaligned), mask);
}

#endif  // HWY_NATIVE_EXPAND

// ------------------------------ TwoTablesLookupLanes

template <class D>
using IndicesFromD = decltype(IndicesFromVec(D(), Zero(RebindToUnsigned<D>())));

// RVV/SVE have their own implementations of
// TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b, IndicesFromD<D> idx)
#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE &&      \
    HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \
    HWY_TARGET != HWY_SVE2_128
template <class D>
HWY_API VFromD<D> TwoTablesLookupLanes(D /*d*/, VFromD<D> a, VFromD<D> b,
                                       IndicesFromD<D> idx) {
  return TwoTablesLookupLanes(a, b, idx);
}
#endif

// ------------------------------ Reverse2, Reverse4, Reverse8 (8-bit)

#if (defined(HWY_NATIVE_REVERSE2_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif

#undef HWY_PREFER_ROTATE
// Platforms on which RotateRight is likely faster than TableLookupBytes.
// RVV and SVE anyway have their own implementation of this.
#if HWY_TARGET == HWY_SSE2 || HWY_TARGET <= HWY_AVX3 || \
    HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_PPC8
#define HWY_PREFER_ROTATE 1
#else
#define HWY_PREFER_ROTATE 0
#endif

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  // Exclude AVX3 because its 16-bit RotateRight is actually 3 instructions.
#if HWY_PREFER_ROTATE && HWY_TARGET > HWY_AVX3
  const Repartition<uint16_t, decltype(d)> du16;
  return BitCast(d, RotateRight<8>(BitCast(du16, v)));
#else
  const VFromD<D> shuffle = Dup128VecFromValues(d, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,
                                                11, 10, 13, 12, 15, 14);
  return TableLookupBytes(v, shuffle);
#endif
}

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
#if HWY_PREFER_ROTATE
  const Repartition<uint16_t, decltype(d)> du16;
  return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v))));
#else
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
      du8, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
  return TableLookupBytes(v, BitCast(d, shuffle));
#endif
}

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
#if HWY_PREFER_ROTATE
  const Repartition<uint32_t, D> du32;
  return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v))));
#else
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
      du8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
  return TableLookupBytes(v, BitCast(d, shuffle));
#endif
}

#endif  // HWY_NATIVE_REVERSE2_8

// ------------------------------ ReverseLaneBytes

#if (defined(HWY_NATIVE_REVERSE_LANE_BYTES) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REVERSE_LANE_BYTES
#undef HWY_NATIVE_REVERSE_LANE_BYTES
#else
#define HWY_NATIVE_REVERSE_LANE_BYTES
#endif

template <class V, HWY_IF_T_SIZE_V(V, 2)>
HWY_API V ReverseLaneBytes(V v) {
  const DFromV<V> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, Reverse2(du8, BitCast(du8, v)));
}

template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_API V ReverseLaneBytes(V v) {
  const DFromV<V> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, Reverse4(du8, BitCast(du8, v)));
}

template <class V, HWY_IF_T_SIZE_V(V, 8)>
HWY_API V ReverseLaneBytes(V v) {
  const DFromV<V> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, Reverse8(du8, BitCast(du8, v)));
}

#endif  // HWY_NATIVE_REVERSE_LANE_BYTES

// ------------------------------ ReverseBits

// On these targets, we emulate 8-bit shifts using 16-bit shifts and therefore
// require at least two lanes to BitCast to 16-bit. We avoid Highway's 8-bit
// shifts because those would add extra masking already taken care of by
// UI8ReverseBitsStep. Note that AVX3_DL/AVX3_ZEN4 support GFNI and use it to
// implement ReverseBits, so this code is not used there.
#undef HWY_REVERSE_BITS_MIN_BYTES
#if ((HWY_TARGET >= HWY_AVX3 && HWY_TARGET <= HWY_SSE2) || \
     HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256)
#define HWY_REVERSE_BITS_MIN_BYTES 2
#else
#define HWY_REVERSE_BITS_MIN_BYTES 1
#endif

#if (defined(HWY_NATIVE_REVERSE_BITS_UI8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REVERSE_BITS_UI8
#undef HWY_NATIVE_REVERSE_BITS_UI8
#else
#define HWY_NATIVE_REVERSE_BITS_UI8
#endif

namespace detail {

template <int kShiftAmt, int kShrResultMask, class V,
          HWY_IF_V_SIZE_GT_D(DFromV<V>, HWY_REVERSE_BITS_MIN_BYTES - 1)>
HWY_INLINE V UI8ReverseBitsStep(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_REVERSE_BITS_MIN_BYTES == 2
  const Repartition<uint16_t, decltype(d)> d_shift;
#else
  const RebindToUnsigned<decltype(d)> d_shift;
#endif

  const auto v_to_shift = BitCast(d_shift, v);
  const auto shl_result = BitCast(d, ShiftLeft<kShiftAmt>(v_to_shift));
  const auto shr_result = BitCast(d, ShiftRight<kShiftAmt>(v_to_shift));
  const auto shr_result_mask =
      BitCast(d, Set(du, static_cast<uint8_t>(kShrResultMask)));
  return Or(And(shr_result, shr_result_mask),
            AndNot(shr_result_mask, shl_result));
}

#if HWY_REVERSE_BITS_MIN_BYTES == 2
template <int kShiftAmt, int kShrResultMask, class V,
          HWY_IF_V_SIZE_D(DFromV<V>, 1)>
HWY_INLINE V UI8ReverseBitsStep(V v) {
  return V{UI8ReverseBitsStep<kShiftAmt, kShrResultMask>(Vec128<uint8_t>{v.raw})
               .raw};
}
#endif

}  // namespace detail

template <class V, HWY_IF_T_SIZE_V(V, 1)>
HWY_API V ReverseBits(V v) {
  auto result = detail::UI8ReverseBitsStep<1, 0x55>(v);
  result = detail::UI8ReverseBitsStep<2, 0x33>(result);
  result = detail::UI8ReverseBitsStep<4, 0x0F>(result);
  return result;
}

#endif  // HWY_NATIVE_REVERSE_BITS_UI8

#if (defined(HWY_NATIVE_REVERSE_BITS_UI16_32_64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
#else
#define HWY_NATIVE_REVERSE_BITS_UI16_32_64
#endif

template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V ReverseBits(V v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v))));
}
#endif  // HWY_NATIVE_REVERSE_BITS_UI16_32_64

// ------------------------------ Per4LaneBlockShuffle

#if (defined(HWY_NATIVE_PER4LANEBLKSHUF_DUP32) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#else
#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#endif

#if HWY_TARGET != HWY_SCALAR
namespace detail {

template <class D>
HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
                                             const uint32_t x2,
                                             const uint32_t x1,
                                             const uint32_t x0) {
#if HWY_TARGET == HWY_RVV
  constexpr int kPow2 = d.Pow2();
  constexpr int kLoadPow2 = HWY_MAX(kPow2, -1);
  const ScalableTag<uint32_t, kLoadPow2> d_load;
#else
  constexpr size_t kMaxBytes = d.MaxBytes();
#if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
  constexpr size_t kMinLanesToLoad = 2;
#else
  constexpr size_t kMinLanesToLoad = 4;
#endif
  constexpr size_t kNumToLoad =
      HWY_MAX(kMaxBytes / sizeof(uint32_t), kMinLanesToLoad);
  const CappedTag<uint32_t, kNumToLoad> d_load;
#endif
  return ResizeBitCast(d, Dup128VecFromValues(d_load, x0, x1, x2, x3));
}

}  // namespace detail
#endif

#endif  // HWY_NATIVE_PER4LANEBLKSHUF_DUP32

#if HWY_TARGET != HWY_SCALAR
namespace detail {

template <class V>
HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<0> /*idx_10_tag*/, V v) {
  return DupEven(v);
}

template <class V>
HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<1> /*idx_10_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return Reverse2(d, v);
}

template <class V>
HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<2> /*idx_10_tag*/, V v) {
  return v;
}

template <class V>
HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<3> /*idx_10_tag*/, V v) {
  return DupOdd(v);
}

HWY_INLINE uint32_t U8x4Per4LaneBlkIndices(const uint32_t idx3,
                                           const uint32_t idx2,
                                           const uint32_t idx1,
                                           const uint32_t idx0) {
#if HWY_IS_LITTLE_ENDIAN
  return static_cast<uint32_t>((idx3 << 24) | (idx2 << 16) | (idx1 << 8) |
                               idx0);
#else
  return static_cast<uint32_t>(idx3 | (idx2 << 8) | (idx1 << 16) |
                               (idx0 << 24));
#endif
}

template <class D>
HWY_INLINE Vec<D> TblLookupPer4LaneBlkU8IdxInBlk(D d, const uint32_t idx3,
                                                 const uint32_t idx2,
                                                 const uint32_t idx1,
                                                 const uint32_t idx0) {
#if HWY_TARGET == HWY_RVV
  const AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32;
#else
  const Repartition<uint32_t, D> du32;
#endif

  return ResizeBitCast(
      d, Set(du32, U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0)));
}

#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
    HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_EMU128
#define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) void* = nullptr
#else
#define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) HWY_IF_T_SIZE_D(D, 8)

template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, V idx) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, TableLookupBytes(BitCast(du8, v), BitCast(du8, idx)));
}

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
                                              const uint32_t idx2,
                                              const uint32_t idx1,
                                              const uint32_t idx0) {
  const Repartition<uint32_t, decltype(d)> du32;
  const uint32_t idx3210 = U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0);
  const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
      du32, static_cast<uint32_t>(idx3210 + 0x0C0C0C0C),
      static_cast<uint32_t>(idx3210 + 0x08080808),
      static_cast<uint32_t>(idx3210 + 0x04040404),
      static_cast<uint32_t>(idx3210));
  return ResizeBitCast(d, v_byte_idx);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
                                              const uint32_t idx2,
                                              const uint32_t idx1,
                                              const uint32_t idx0) {
  const Repartition<uint32_t, decltype(d)> du32;
#if HWY_IS_LITTLE_ENDIAN
  const uint32_t idx10 = static_cast<uint32_t>((idx1 << 16) | idx0);
  const uint32_t idx32 = static_cast<uint32_t>((idx3 << 16) | idx2);
  constexpr uint32_t kLaneByteOffsets{0x01000100};
#else
  const uint32_t idx10 = static_cast<uint32_t>(idx1 | (idx0 << 16));
  const uint32_t idx32 = static_cast<uint32_t>(idx3 | (idx2 << 16));
  constexpr uint32_t kLaneByteOffsets{0x00010001};
#endif
  constexpr uint32_t kHiLaneByteOffsets{kLaneByteOffsets + 0x08080808u};

  const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
      du32, static_cast<uint32_t>(idx32 * 0x0202u + kHiLaneByteOffsets),
      static_cast<uint32_t>(idx10 * 0x0202u + kHiLaneByteOffsets),
      static_cast<uint32_t>(idx32 * 0x0202u + kLaneByteOffsets),
      static_cast<uint32_t>(idx10 * 0x0202u + kLaneByteOffsets));
  return ResizeBitCast(d, v_byte_idx);
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
                                              const uint32_t idx2,
                                              const uint32_t idx1,
                                              const uint32_t idx0) {
  const Repartition<uint32_t, decltype(d)> du32;
#if HWY_IS_LITTLE_ENDIAN
  constexpr uint32_t kLaneByteOffsets{0x03020100};
#else
  constexpr uint32_t kLaneByteOffsets{0x00010203};
#endif

  const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
      du32, static_cast<uint32_t>(idx3 * 0x04040404u + kLaneByteOffsets),
      static_cast<uint32_t>(idx2 * 0x04040404u + kLaneByteOffsets),
      static_cast<uint32_t>(idx1 * 0x04040404u + kLaneByteOffsets),
      static_cast<uint32_t>(idx0 * 0x04040404u + kLaneByteOffsets));
  return ResizeBitCast(d, v_byte_idx);
}
#endif

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) {
  return TblLookupPer4LaneBlkU8IdxInBlk(d, idx3, idx2, idx1, idx0);
}

#if HWY_TARGET == HWY_RVV
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) {
  const Rebind<uint8_t, decltype(d)> du8;
  return PromoteTo(d,
                   TblLookupPer4LaneBlkU8IdxInBlk(du8, idx3, idx2, idx1, idx0));
}
#else
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) {
  const uint16_t u16_idx0 = static_cast<uint16_t>(idx0);
  const uint16_t u16_idx1 = static_cast<uint16_t>(idx1);
  const uint16_t u16_idx2 = static_cast<uint16_t>(idx2);
  const uint16_t u16_idx3 = static_cast<uint16_t>(idx3);
#if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
  constexpr size_t kMinLanesToLoad = 4;
#else
  constexpr size_t kMinLanesToLoad = 8;
#endif
  constexpr size_t kNumToLoad = HWY_MAX(HWY_MAX_LANES_D(D), kMinLanesToLoad);
  const CappedTag<uint16_t, kNumToLoad> d_load;
  return ResizeBitCast(
      d, Dup128VecFromValues(d_load, u16_idx0, u16_idx1, u16_idx2, u16_idx3,
                             u16_idx0, u16_idx1, u16_idx2, u16_idx3));
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) {
  return Per4LaneBlkShufDupSet4xU32(d, idx3, idx2, idx1, idx0);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) {
  const RebindToUnsigned<decltype(d)> du;
  const Rebind<uint32_t, decltype(d)> du32;
  return BitCast(d, PromoteTo(du, Per4LaneBlkShufDupSet4xU32(du32, idx3, idx2,
                                                             idx1, idx0)));
}
#endif

template <class D, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D)>
HWY_INLINE IndicesFromD<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
                                                       const uint32_t idx2,
                                                       const uint32_t idx1,
                                                       const uint32_t idx0) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  auto idx_in_blk = TblLookupPer4LaneBlkIdxInBlk(du, idx3, idx2, idx1, idx0);

  constexpr size_t kN = HWY_MAX_LANES_D(D);
  if (kN < 4) {
    idx_in_blk = And(idx_in_blk, Set(du, static_cast<TU>(kN - 1)));
  }

#if HWY_TARGET == HWY_RVV
  const auto blk_offsets = AndS(Iota0(du), static_cast<TU>(~TU{3}));
#else
  const auto blk_offsets =
      And(Iota(du, TU{0}), Set(du, static_cast<TU>(~TU{3})));
#endif
  return IndicesFromVec(d, Add(idx_in_blk, blk_offsets));
}

template <class V, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(DFromV<V>)>
HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, IndicesFromD<DFromV<V>> idx) {
  return TableLookupLanes(v, idx);
}

#undef HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE

template <class V>
HWY_INLINE V TblLookupPer4LaneBlkShuf(V v, size_t idx3210) {
  const DFromV<decltype(v)> d;
  const uint32_t idx3 = static_cast<uint32_t>((idx3210 >> 6) & 3);
  const uint32_t idx2 = static_cast<uint32_t>((idx3210 >> 4) & 3);
  const uint32_t idx1 = static_cast<uint32_t>((idx3210 >> 2) & 3);
  const uint32_t idx0 = static_cast<uint32_t>(idx3210 & 3);
  const auto idx = TblLookupPer4LaneBlkShufIdx(d, idx3, idx2, idx1, idx0);
  return Per4LaneBlkShufDoTblLookup(v, idx);
}

// The detail::Per4LaneBlockShuffle overloads that have the extra lane_size_tag
// and vect_size_tag parameters are only called for vectors that have at
// least 4 lanes (or scalable vectors that might possibly have 4 or more lanes)
template <size_t kIdx3210, size_t kLaneSize, size_t kVectSize, class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<kLaneSize> /*lane_size_tag*/,
                                  hwy::SizeTag<kVectSize> /*vect_size_tag*/,
                                  V v) {
  return TblLookupPer4LaneBlkShuf(v, kIdx3210);
}

#if HWY_HAVE_FLOAT64
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> Per4LaneBlockShufCastToWide(
    hwy::FloatTag /* type_tag */, hwy::SizeTag<4> /* lane_size_tag */, V v) {
  const DFromV<decltype(v)> d;
  const RepartitionToWide<decltype(d)> dw;
  return BitCast(dw, v);
}
#endif

template <size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWide<RebindToUnsigned<DFromV<V>>>>
Per4LaneBlockShufCastToWide(hwy::FloatTag /* type_tag */,
                            hwy::SizeTag<kLaneSize> /* lane_size_tag */, V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RepartitionToWide<decltype(du)> dw;
  return BitCast(dw, v);
}

template <size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> Per4LaneBlockShufCastToWide(
    hwy::NonFloatTag /* type_tag */,
    hwy::SizeTag<kLaneSize> /* lane_size_tag */, V v) {
  const DFromV<decltype(v)> d;
  const RepartitionToWide<decltype(d)> dw;
  return BitCast(dw, v);
}

template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x1B> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return Reverse4(d, v);
}

template <class V,
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
                                        (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const auto vw = Per4LaneBlockShufCastToWide(
      hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
  return BitCast(d, DupEven(vw));
}

template <class V,
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
                                        (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const auto vw = Per4LaneBlockShufCastToWide(
      hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
  const DFromV<decltype(vw)> dw;
  return BitCast(d, Reverse2(dw, vw));
}

#if HWY_MAX_BYTES >= 32
template <class V, HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) {
  return SwapAdjacentBlocks(v);
}
#endif

template <class V, HWY_IF_LANES_D(DFromV<V>, 4),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return InterleaveLower(d, v, v);
}

template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return InterleaveLower(d, v, v);
}

template <class V, HWY_IF_LANES_D(DFromV<V>, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return ConcatEven(d, v, v);
}

template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xA0> /*idx_3210_tag*/, V v) {
  return DupEven(v);
}

template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xB1> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return Reverse2(d, v);
}

template <class V, HWY_IF_LANES_D(DFromV<V>, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return ConcatOdd(d, v, v);
}

template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xE4> /*idx_3210_tag*/, V v) {
  return v;
}

template <class V,
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
                                        (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const auto vw = Per4LaneBlockShufCastToWide(
      hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
  return BitCast(d, DupOdd(vw));
}

template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xF5> /*idx_3210_tag*/, V v) {
  return DupOdd(v);
}

template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xFA> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return InterleaveUpper(d, v, v);
}

template <size_t kIdx3210, class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag, V v) {
  const DFromV<decltype(v)> d;
  return Per4LaneBlockShuffle(idx_3210_tag, hwy::SizeTag<sizeof(TFromV<V>)>(),
                              hwy::SizeTag<d.MaxBytes()>(), v);
}

}  // namespace detail
#endif  // HWY_TARGET != HWY_SCALAR

template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
          HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V Per4LaneBlockShuffle(V v) {
  static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
  static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
  static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
  static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");

  return v;
}

#if HWY_TARGET != HWY_SCALAR
template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
          HWY_IF_LANES_D(DFromV<V>, 2)>
HWY_API V Per4LaneBlockShuffle(V v) {
  static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
  static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
  static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
  static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");

  constexpr bool isReverse2 = (kIdx0 == 1 || kIdx1 == 0) && (kIdx0 != kIdx1);
  constexpr size_t kPer2BlkIdx0 = (kIdx0 <= 1) ? kIdx0 : (isReverse2 ? 1 : 0);
  constexpr size_t kPer2BlkIdx1 = (kIdx1 <= 1) ? kIdx1 : (isReverse2 ? 0 : 1);

  constexpr size_t kIdx10 = (kPer2BlkIdx1 << 1) | kPer2BlkIdx0;
  static_assert(kIdx10 <= 3, "kIdx10 <= 3 must be true");
  return detail::Per2LaneBlockShuffle(hwy::SizeTag<kIdx10>(), v);
}

template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
          HWY_IF_LANES_GT_D(DFromV<V>, 2)>
HWY_API V Per4LaneBlockShuffle(V v) {
  static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
  static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
  static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
  static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");

  constexpr size_t kIdx3210 =
      (kIdx3 << 6) | (kIdx2 << 4) | (kIdx1 << 2) | kIdx0;
  return detail::Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210>(), v);
}
#endif

// ------------------------------ Blocks

template <class D>
HWY_API size_t Blocks(D d) {
  return (d.MaxBytes() <= 16) ? 1 : ((Lanes(d) * sizeof(TFromD<D>) + 15) / 16);
}

// ------------------------------ Block insert/extract/broadcast ops
#if (defined(HWY_NATIVE_BLK_INSERT_EXTRACT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BLK_INSERT_EXTRACT
#undef HWY_NATIVE_BLK_INSERT_EXTRACT
#else
#define HWY_NATIVE_BLK_INSERT_EXTRACT
#endif

template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V InsertBlock(V /*v*/, V blk_to_insert) {
  static_assert(kBlockIdx == 0, "Invalid block index");
  return blk_to_insert;
}

template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V ExtractBlock(V v) {
  static_assert(kBlockIdx == 0, "Invalid block index");
  return v;
}

template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V BroadcastBlock(V v) {
  static_assert(kBlockIdx == 0, "Invalid block index");
  return v;
}

#endif  // HWY_NATIVE_BLK_INSERT_EXTRACT

// ------------------------------ BroadcastLane
#if (defined(HWY_NATIVE_BROADCASTLANE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BROADCASTLANE
#undef HWY_NATIVE_BROADCASTLANE
#else
#define HWY_NATIVE_BROADCASTLANE
#endif

template <int kLane, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V BroadcastLane(V v) {
  return Broadcast<kLane>(v);
}

#endif  // HWY_NATIVE_BROADCASTLANE

// ------------------------------ Slide1Up and Slide1Down
#if (defined(HWY_NATIVE_SLIDE1_UP_DOWN) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SLIDE1_UP_DOWN
#undef HWY_NATIVE_SLIDE1_UP_DOWN
#else
#define HWY_NATIVE_SLIDE1_UP_DOWN
#endif

template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> /*v*/) {
  return Zero(d);
}
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> Slide1Down(D d, VFromD<D> /*v*/) {
  return Zero(d);
}

#if HWY_TARGET != HWY_SCALAR
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
  return ShiftLeftLanes<1>(d, v);
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
  return ShiftRightLanes<1>(d, v);
}
#endif  // HWY_TARGET != HWY_SCALAR

#endif  // HWY_NATIVE_SLIDE1_UP_DOWN

// ------------------------------ SlideUpBlocks

template <int kBlocks, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> SlideUpBlocks(D /*d*/, VFromD<D> v) {
  static_assert(kBlocks == 0, "kBlocks == 0 must be true");
  return v;
}

#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
template <int kBlocks, class D, HWY_IF_V_SIZE_GT_D(D, 16)>
HWY_API VFromD<D> SlideUpBlocks(D d, VFromD<D> v) {
  static_assert(0 <= kBlocks && static_cast<size_t>(kBlocks) < d.MaxBlocks(),
                "kBlocks must be between 0 and d.MaxBlocks() - 1");
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
  return SlideUpLanes(d, v, static_cast<size_t>(kBlocks) * kLanesPerBlock);
}
#endif

// ------------------------------ SlideDownBlocks

template <int kBlocks, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> SlideDownBlocks(D /*d*/, VFromD<D> v) {
  static_assert(kBlocks == 0, "kBlocks == 0 must be true");
  return v;
}

#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
template <int kBlocks, class D, HWY_IF_V_SIZE_GT_D(D, 16)>
HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
  static_assert(0 <= kBlocks && static_cast<size_t>(kBlocks) < d.MaxBlocks(),
                "kBlocks must be between 0 and d.MaxBlocks() - 1");
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
  return SlideDownLanes(d, v, static_cast<size_t>(kBlocks) * kLanesPerBlock);
}
#endif

// ------------------------------ SumsOfAdjQuadAbsDiff

#if (defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#endif

#if HWY_TARGET != HWY_SCALAR
template <int kAOffset, int kBOffset, class V8, HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
  static_assert(0 <= kAOffset && kAOffset <= 1,
                "kAOffset must be between 0 and 1");
  static_assert(0 <= kBOffset && kBOffset <= 3,
                "kBOffset must be between 0 and 3");
  using D8 = DFromV<V8>;
  const D8 d8;
  const RebindToUnsigned<decltype(d8)> du8;
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(du8)> du16;

  // Ensure that a is resized to a vector that has at least
  // HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the interleave and
  // CombineShiftRightBytes operations below.
#if HWY_TARGET == HWY_RVV
  // On RVV targets, need to ensure that d8_interleave.Pow2() >= 0 is true
  // to ensure that Lanes(d8_interleave) >= 16 is true.

  // Lanes(d8_interleave) >= Lanes(d8) is guaranteed to be true on RVV
  // targets as d8_interleave.Pow2() >= d8.Pow2() is true.
  constexpr int kInterleavePow2 = HWY_MAX(d8.Pow2(), 0);
  const ScalableTag<TFromD<D8>, kInterleavePow2> d8_interleave;
#elif HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
    HWY_TARGET == HWY_SVE2_128
  // On SVE targets, Lanes(d8_interleave) >= 16 and
  // Lanes(d8_interleave) >= Lanes(d8) are both already true as d8 is a SIMD
  // tag for a full u8/i8 vector on SVE.
  const D8 d8_interleave;
#else
  // On targets that use non-scalable vector types, Lanes(d8_interleave) is
  // equal to HWY_MAX(Lanes(d8), size_t{8} << kAOffset).
  constexpr size_t kInterleaveLanes =
      HWY_MAX(HWY_MAX_LANES_D(D8), size_t{8} << kAOffset);
  const FixedTag<TFromD<D8>, kInterleaveLanes> d8_interleave;
#endif

  // The ResizeBitCast operation below will resize a to a vector that has
  // at least HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the
  // InterleaveLower, InterleaveUpper, and CombineShiftRightBytes operations
  // below.
  const auto a_to_interleave = ResizeBitCast(d8_interleave, a);

  const auto a_interleaved_lo =
      InterleaveLower(d8_interleave, a_to_interleave, a_to_interleave);
  const auto a_interleaved_hi =
      InterleaveUpper(d8_interleave, a_to_interleave, a_to_interleave);

  /* a01: { a[kAOffset*4+0], a[kAOffset*4+1], a[kAOffset*4+1], a[kAOffset*4+2],
            a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
            a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
            a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8] }
   */
  /* a23: { a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
            a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
            a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8],
            a[kAOffset*4+8], a[kAOffset*4+9], a[kAOffset*4+9], a[kAOffset*4+10]
     } */

  // a01 and a23 are resized back to V8 as only the first Lanes(d8) lanes of
  // the CombineShiftRightBytes are needed for the subsequent AbsDiff operations
  // and as a01 and a23 need to be the same vector type as b01 and b23 for the
  // AbsDiff operations below.
  const V8 a01 =
      ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 1>(
                            d8_interleave, a_interleaved_hi, a_interleaved_lo));
  const V8 a23 =
      ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 5>(
                            d8_interleave, a_interleaved_hi, a_interleaved_lo));

  /* b01: { b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1] }
   */
  /* b23: { b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3] }
   */
  const V8 b01 = BitCast(d8, Broadcast<kBOffset * 2>(BitCast(d16, b)));
  const V8 b23 = BitCast(d8, Broadcast<kBOffset * 2 + 1>(BitCast(d16, b)));

  const VFromD<decltype(du16)> absdiff_sum_01 =
      SumsOf2(BitCast(du8, AbsDiff(a01, b01)));
  const VFromD<decltype(du16)> absdiff_sum_23 =
      SumsOf2(BitCast(du8, AbsDiff(a23, b23)));
  return BitCast(d16, Add(absdiff_sum_01, absdiff_sum_23));
}
#endif  // HWY_TARGET != HWY_SCALAR

#endif  // HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF

// ------------------------------ SumsOfShuffledQuadAbsDiff

#if (defined(HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#endif

#if HWY_TARGET != HWY_SCALAR
template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V8,
          HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
                                                                     V8 b) {
  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");

#if HWY_TARGET == HWY_RVV
  // On RVV, ensure that both vA and vB have a LMUL of at least 1/2 so that
  // both vA and vB can be bitcasted to a u32 vector.
  const detail::AdjustSimdTagToMinVecPow2<
      RepartitionToWideX2<DFromV<decltype(a)>>>
      d32;
  const RepartitionToNarrow<decltype(d32)> d16;
  const RepartitionToNarrow<decltype(d16)> d8;

  const auto vA = ResizeBitCast(d8, a);
  const auto vB = ResizeBitCast(d8, b);
#else
  const DFromV<decltype(a)> d8;
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(d16)> d32;

  const auto vA = a;
  const auto vB = b;
#endif

  const RebindToUnsigned<decltype(d8)> du8;

  const auto a_shuf =
      Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0>(BitCast(d32, vA));
  /* a0123_2345: { a_shuf[0], a_shuf[1], a_shuf[2], a_shuf[3],
                   a_shuf[2], a_shuf[3], a_shuf[4], a_shuf[5],
                   a_shuf[8], a_shuf[9], a_shuf[10], a_shuf[11],
                   a_shuf[10], a_shuf[11], a_shuf[12], a_shuf[13] } */
  /* a1234_3456: { a_shuf[1], a_shuf[2], a_shuf[3], a_shuf[4],
                   a_shuf[3], a_shuf[4], a_shuf[5], a_shuf[6],
                   a_shuf[9], a_shuf[10], a_shuf[11], a_shuf[12],
                   a_shuf[11], a_shuf[12], a_shuf[13], a_shuf[14] } */
#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
  // On RVV/SVE targets, use Slide1Up/Slide1Down instead of
  // ShiftLeftBytes/ShiftRightBytes to avoid unnecessary zeroing out of any
  // lanes that are shifted into an adjacent 16-byte block as any lanes that are
  // shifted into an adjacent 16-byte block by Slide1Up/Slide1Down will be
  // replaced by the OddEven operation.
  const auto a_0123_2345 = BitCast(
      d8, OddEven(BitCast(d32, Slide1Up(d16, BitCast(d16, a_shuf))), a_shuf));
  const auto a_1234_3456 =
      BitCast(d8, OddEven(BitCast(d32, Slide1Up(d8, BitCast(d8, a_shuf))),
                          BitCast(d32, Slide1Down(d8, BitCast(d8, a_shuf)))));
#else
  const auto a_0123_2345 =
      BitCast(d8, OddEven(ShiftLeftBytes<2>(d32, a_shuf), a_shuf));
  const auto a_1234_3456 = BitCast(
      d8,
      OddEven(ShiftLeftBytes<1>(d32, a_shuf), ShiftRightBytes<1>(d32, a_shuf)));
#endif

  auto even_sums = SumsOf4(BitCast(du8, AbsDiff(a_0123_2345, vB)));
  auto odd_sums = SumsOf4(BitCast(du8, AbsDiff(a_1234_3456, vB)));

#if HWY_IS_LITTLE_ENDIAN
  odd_sums = ShiftLeft<16>(odd_sums);
#else
  even_sums = ShiftLeft<16>(even_sums);
#endif

  const auto sums = OddEven(BitCast(d16, odd_sums), BitCast(d16, even_sums));

#if HWY_TARGET == HWY_RVV
  return ResizeBitCast(RepartitionToWide<DFromV<V8>>(), sums);
#else
  return sums;
#endif
}
#endif  // HWY_TARGET != HWY_SCALAR

#endif  // HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF

// ================================================== Operator wrapper

// SVE* and RVV currently cannot define operators and have already defined
// (only) the corresponding functions such as Add.
#if (defined(HWY_NATIVE_OPERATOR_REPLACEMENTS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
#else
#define HWY_NATIVE_OPERATOR_REPLACEMENTS
#endif

template <class V>
HWY_API V Add(V a, V b) {
  return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
  return a - b;
}

template <class V>
HWY_API V Mul(V a, V b) {
  return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
  return a / b;
}
template <class V>
HWY_API V Mod(V a, V b) {
  return a % b;
}

template <class V>
V Shl(V a, V b) {
  return a << b;
}
template <class V>
V Shr(V a, V b) {
  return a >> b;
}

template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
  return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
  return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
  return a < b;
}

template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
  return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
  return a >= b;
}

template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
  return a <= b;
}

#endif  // HWY_NATIVE_OPERATOR_REPLACEMENTS

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.383 Sekunden (vorverarbeitet am 2026-06-06) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.