// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/* Proper support for integers is only provided by AVX2. In the meantime, we'll
   use SSE instructions and packets to deal with integers.
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet8i type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=8
  };
};
*/
// Helper function for bit-packing the flags of a low-precision comparison:
// it narrows the eight 32-bit comparison flags in `rf` to eight 16-bit flags.
EIGEN_STRONG_INLINE __m128i Pack16To8(Packet8f rf) {
  return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0),
                         _mm256_extractf128_si256(_mm256_castps_si256(rf), 1));
}
template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_add_epi32(a,b);
#else
  // AVX1 has no 256-bit integer add: process the two 128-bit halves with SSE and recombine.
  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_sub_epi32(a,b);
#else
  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
{
  return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
{
  return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pmul<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_mullo_epi32(a,b);
#else
  const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
{
  eigen_assert(false && "packet integer division is not supported by AVX");
  return pset1<Packet8i>(0);
}
#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
  // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
  // and even register spilling with clang>=6.0 (bug 1637).
  // Gcc stupidly generates a vfmadd132ps instruction.
  // So let's enforce it to generate a vfmadd231ps instruction since the most common use
  // case is to accumulate the result of the product.
  Packet8f res = c;
  __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  return _mm256_fmadd_ps(a,b,c);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
  // see above
  Packet4d res = c;
  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  return _mm256_fmadd_pd(a,b,c);
#endif
}
#endif
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
  // There appears to be a bug in GCC, by which the optimizer may flip
  // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
  Packet8f res;
  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  // Arguments are swapped to match NaN propagation behavior of std::min.
  return _mm256_min_ps(b,a);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
  // See pmin above
  Packet4d res;
  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  // Arguments are swapped to match NaN propagation behavior of std::min.
  return _mm256_min_pd(b,a);
#endif
}

template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
  // See pmin above
  Packet8f res;
  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  // Arguments are swapped to match NaN propagation behavior of std::max.
  return _mm256_max_ps(b,a);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
  // See pmin above
  Packet4d res;
  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  // Arguments are swapped to match NaN propagation behavior of std::max.
  return _mm256_max_pd(b,a);
#endif
}
// Add specializations for min/max with prescribed NaN propagation.
template<>
EIGEN_STRONG_INLINE Packet8f pmin<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
  return pminmax_propagate_numbers(a, b, pmin<Packet8f>);
}
template<>
EIGEN_STRONG_INLINE Packet4d pmin<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
  return pminmax_propagate_numbers(a, b, pmin<Packet4d>);
}
template<>
EIGEN_STRONG_INLINE Packet8f pmax<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
  return pminmax_propagate_numbers(a, b, pmax<Packet8f>);
}
template<>
EIGEN_STRONG_INLINE Packet4d pmax<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
  return pminmax_propagate_numbers(a, b, pmax<Packet4d>);
}
template<>
EIGEN_STRONG_INLINE Packet8f pmin<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
  return pminmax_propagate_nan(a, b, pmin<Packet8f>);
}
template<>
EIGEN_STRONG_INLINE Packet4d pmin<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
  return pminmax_propagate_nan(a, b, pmin<Packet4d>);
}
template<>
EIGEN_STRONG_INLINE Packet8f pmax<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
  return pminmax_propagate_nan(a, b, pmax<Packet8f>);
}
template<>
EIGEN_STRONG_INLINE Packet4d pmax<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
  return pminmax_propagate_nan(a, b, pmax<Packet4d>);
}
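
// For intuition only, a disabled sketch of the propagate-numbers combinator's
// semantics (an assumption inferred from its use above, not the actual generic
// implementation): a NaN lane loses to a number lane, and op decides otherwise.
#if 0
template<typename Packet, typename Op>
EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers_sketch(const Packet& a, const Packet& b, Op op) {
  Packet not_nan_mask_a = pcmp_eq(a, a);  // all-ones where a is not NaN
  Packet not_nan_mask_b = pcmp_eq(b, b);  // all-ones where b is not NaN
  // a is NaN -> take b; b is NaN -> take a; both numbers -> op(a,b).
  return pselect(not_nan_mask_a,
                 pselect(not_nan_mask_b, op(a, b), a),
                 b);
}
#endif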
template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }

template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
template<> EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
#ifdef EIGEN_VECTORIZE_AVX2
  // vpcmpeqd has lower latency than the more general vcmpps
  return _mm256_cmpeq_epi32(a,a);
#else
  const __m256 b = _mm256_castsi256_ps(a);
  return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ));
#endif
}

template<> EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
#ifdef EIGEN_VECTORIZE_AVX2
  // vpcmpeqd has lower latency than the more general vcmpps
  const __m256i b = _mm256_castps_si256(a);
  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b));
#else
  return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ);
#endif
}

template<> EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
#ifdef EIGEN_VECTORIZE_AVX2
  // vpcmpeqq has lower latency than the more general vcmppd
  const __m256i b = _mm256_castpd_si256(a);
  return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b));
#else
  return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_and_si256(a,b);
#else
  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
#endif
}
// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
{
  // TODO try to find a way to avoid the need of a temporary register
//   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
//   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
//   return _mm256_unpacklo_ps(tmp,tmp);

  // _mm256_insertf128_ps is very slow on Haswell, thus:
  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // mimic an "inplace" permutation of the lower 128bits using a blend
  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
  // then we can perform a consistent permutation on the global register to get everything in shape:
  return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
}
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
{
  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
  return _mm256_permute_pd(tmp, 3<<2);
}
// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
{
  return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
                       from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
{
  return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
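
// For reference, a disabled sketch of the AVX2 gather suggested in the NOTE
// above (an assumption, not a tuned implementation): indices 0..7 are scaled
// by `stride` and fed to _mm256_i32gather_ps with a 4-byte element scale.
// _mm256_mullo_epi32 requires AVX2 as well, so the whole sketch is AVX2-only.
#if 0
template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
{
  __m256i indices = _mm256_mullo_epi32(_mm256_set1_epi32(int(stride)),
                                       _mm256_setr_epi32(0,1,2,3,4,5,6,7));
  return _mm256_i32gather_ps(from, indices, 4);
}
#endif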
template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
{
  Packet8f pa = pset1<Packet8f>(a);
  pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
{
  Packet4d pa = pset1<Packet4d>(a);
  pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
{
  Packet8i pa = pset1<Packet8i>(a);
  pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
}
template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
}
template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}
template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
{
  __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
  return _mm256_permute2f128_ps(tmp, tmp, 1);
}
template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
  __m256d tmp = _mm256_shuffle_pd(a,a,5);
  return _mm256_permute2f128_pd(tmp, tmp, 1);
#if 0
  // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
  // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
  __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
  return _mm256_permute_pd(swap_halves,5);
#endif
}
// pabs should be ok
template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
{
  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm256_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
{
  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm256_and_pd(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a, const Packet8h& b) {
  // Packet8h is a wrapper around __m128i, so the integer intrinsics
  // can be called directly, as below:
  return _mm_or_si128(a,b);
}
template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a, const Packet8h& b) {
  return _mm_xor_si128(a,b);
}
template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a, const Packet8h& b) {
  return _mm_and_si128(a,b);
}
template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a, const Packet8h& b) {
  return _mm_andnot_si128(b,a);
}
template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
  return _mm_blendv_epi8(b, a, mask);
}
template<> EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
  return float2half(pround<Packet8f>(half2float(a)));
}

template<> EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
  return float2half(print<Packet8f>(half2float(a)));
}

template<> EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
  return float2half(pceil<Packet8f>(half2float(a)));
}

template<> EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
  return float2half(pfloor<Packet8f>(half2float(a)));
}
template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
  Packet8f af = half2float(a);
  float reduced = predux<Packet8f>(af);
  return Eigen::half(reduced);
}

template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
  Packet8f af = half2float(a);
  float reduced = predux_max<Packet8f>(af);
  return Eigen::half(reduced);
}

template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
  Packet8f af = half2float(a);
  float reduced = predux_min<Packet8f>(af);
  return Eigen::half(reduced);
}

template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
  Packet8f af = half2float(a);
  float reduced = predux_mul<Packet8f>(af);
  return Eigen::half(reduced);
}
template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
{
  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
  return _mm_shuffle_epi8(a,m);
}
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet8h,8>& kernel) {
__m128i a = kernel.packet[0];
__m128i b = kernel.packet[1];
__m128i c = kernel.packet[2];
__m128i d = kernel.packet[3];
__m128i e = kernel.packet[4];
__m128i f = kernel.packet[5];
__m128i g = kernel.packet[6];
__m128i h = kernel.packet[7];
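
  // The loads above only gather the rows; what follows is a sketch of the
  // usual 8x8 transpose of 16-bit lanes via three rounds of interleaving
  // (16-bit, 32-bit, then 64-bit unpacks), assuming Packet8h converts
  // implicitly to and from __m128i as elsewhere in this file.
  __m128i t0 = _mm_unpacklo_epi16(a, b);  // a0 b0 a1 b1 a2 b2 a3 b3
  __m128i t1 = _mm_unpackhi_epi16(a, b);  // a4 b4 a5 b5 a6 b6 a7 b7
  __m128i t2 = _mm_unpacklo_epi16(c, d);
  __m128i t3 = _mm_unpackhi_epi16(c, d);
  __m128i t4 = _mm_unpacklo_epi16(e, f);
  __m128i t5 = _mm_unpackhi_epi16(e, f);
  __m128i t6 = _mm_unpacklo_epi16(g, h);
  __m128i t7 = _mm_unpackhi_epi16(g, h);

  __m128i u0 = _mm_unpacklo_epi32(t0, t2);  // a0 b0 c0 d0 a1 b1 c1 d1
  __m128i u1 = _mm_unpackhi_epi32(t0, t2);
  __m128i u2 = _mm_unpacklo_epi32(t4, t6);  // e0 f0 g0 h0 e1 f1 g1 h1
  __m128i u3 = _mm_unpackhi_epi32(t4, t6);
  __m128i u4 = _mm_unpacklo_epi32(t1, t3);
  __m128i u5 = _mm_unpackhi_epi32(t1, t3);
  __m128i u6 = _mm_unpacklo_epi32(t5, t7);
  __m128i u7 = _mm_unpackhi_epi32(t5, t7);

  kernel.packet[0] = _mm_unpacklo_epi64(u0, u2);  // a0 b0 c0 d0 e0 f0 g0 h0
  kernel.packet[1] = _mm_unpackhi_epi64(u0, u2);
  kernel.packet[2] = _mm_unpacklo_epi64(u1, u3);
  kernel.packet[3] = _mm_unpackhi_epi64(u1, u3);
  kernel.packet[4] = _mm_unpacklo_epi64(u4, u6);
  kernel.packet[5] = _mm_unpackhi_epi64(u4, u6);
  kernel.packet[6] = _mm_unpacklo_epi64(u5, u7);
  kernel.packet[7] = _mm_unpackhi_epi64(u5, u7);
}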
// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) {
  Packet8bf r;

  __m256i input = _mm256_castps_si256(a);

#ifdef EIGEN_VECTORIZE_AVX2
  // uint32_t lsb = (input >> 16) & 1;
  __m256i t = _mm256_srli_epi32(input, 16);
  t = _mm256_and_si256(t, _mm256_set1_epi32(1));
  // uint32_t rounding_bias = 0x7fff + lsb;
  t = _mm256_add_epi32(t, _mm256_set1_epi32(0x7fff));
  // input += rounding_bias;
  t = _mm256_add_epi32(t, input);
  // input = input >> 16;
  t = _mm256_srli_epi32(t, 16);
  // Check NaN before converting back to bf16
  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
  __m256i nan = _mm256_set1_epi32(0x7fc0);
  t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask));
  // output = numext::bit_cast<uint16_t>(input);
  return _mm_packus_epi32(_mm256_extractf128_si256(t, 0),
                          _mm256_extractf128_si256(t, 1));
#else
  // uint32_t lsb = (input >> 16) & 1;
  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(input, 0), 16);
  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(input, 1), 16);
  lo = _mm_and_si128(lo, _mm_set1_epi32(1));
  hi = _mm_and_si128(hi, _mm_set1_epi32(1));
  // uint32_t rounding_bias = 0x7fff + lsb;
  lo = _mm_add_epi32(lo, _mm_set1_epi32(0x7fff));
  hi = _mm_add_epi32(hi, _mm_set1_epi32(0x7fff));
  // input += rounding_bias;
  lo = _mm_add_epi32(lo, _mm256_extractf128_si256(input, 0));
  hi = _mm_add_epi32(hi, _mm256_extractf128_si256(input, 1));
  // input = input >> 16;
  lo = _mm_srli_epi32(lo, 16);
  hi = _mm_srli_epi32(hi, 16);
  // Check NaN before converting back to bf16
  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
  __m128i nan = _mm_set1_epi32(0x7fc0);
  lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask)));
  hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1)));
  // output = numext::bit_cast<uint16_t>(input);
  return _mm_packus_epi32(lo, hi);
#endif
}
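
// For reference only, a disabled scalar sketch of the rounding scheme the
// vector code above implements (a hypothetical helper, not part of the API;
// it assumes numext::bit_cast behaves here as it does elsewhere in Eigen):
#if 0
inline Eigen::bfloat16 f32_to_bf16_scalar(float x) {
  unsigned int input = numext::bit_cast<unsigned int>(x);
  if (x != x) {
    input = 0x7fc00000u;                         // canonical quiet NaN
  } else {
    unsigned int lsb = (input >> 16) & 1;        // last bit that will be kept
    unsigned int rounding_bias = 0x7fffu + lsb;  // ties round to even
    input += rounding_bias;
  }
  // Keep the high 16 bits as the bfloat16 payload.
  return numext::bit_cast<Eigen::bfloat16>(static_cast<unsigned short>(input >> 16));
}
#endif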
template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
  return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));
}

template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
  return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));
}

template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
  return static_cast<bfloat16>(predux_min<Packet8f>(Bf16ToF32(a)));
}

template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
  return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));
}
template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
{
  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
  return _mm_shuffle_epi8(a,m);
}
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet8bf,8>& kernel) {
__m128i a = kernel.packet[0];
__m128i b = kernel.packet[1];
__m128i c = kernel.packet[2];
__m128i d = kernel.packet[3];
__m128i e = kernel.packet[4];
__m128i f = kernel.packet[5];
__m128i g = kernel.packet[6];
__m128i h = kernel.packet[7];
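
  // As for Packet8h above, a sketch of the same three-round 16-bit transpose;
  // bfloat16 lanes are also 16 bits wide, so the identical shuffle network
  // applies (again assuming Packet8bf converts implicitly to/from __m128i).
  __m128i t0 = _mm_unpacklo_epi16(a, b);
  __m128i t1 = _mm_unpackhi_epi16(a, b);
  __m128i t2 = _mm_unpacklo_epi16(c, d);
  __m128i t3 = _mm_unpackhi_epi16(c, d);
  __m128i t4 = _mm_unpacklo_epi16(e, f);
  __m128i t5 = _mm_unpackhi_epi16(e, f);
  __m128i t6 = _mm_unpacklo_epi16(g, h);
  __m128i t7 = _mm_unpackhi_epi16(g, h);

  __m128i u0 = _mm_unpacklo_epi32(t0, t2);
  __m128i u1 = _mm_unpackhi_epi32(t0, t2);
  __m128i u2 = _mm_unpacklo_epi32(t4, t6);
  __m128i u3 = _mm_unpackhi_epi32(t4, t6);
  __m128i u4 = _mm_unpacklo_epi32(t1, t3);
  __m128i u5 = _mm_unpackhi_epi32(t1, t3);
  __m128i u6 = _mm_unpacklo_epi32(t5, t7);
  __m128i u7 = _mm_unpackhi_epi32(t5, t7);

  kernel.packet[0] = _mm_unpacklo_epi64(u0, u2);
  kernel.packet[1] = _mm_unpackhi_epi64(u0, u2);
  kernel.packet[2] = _mm_unpacklo_epi64(u1, u3);
  kernel.packet[3] = _mm_unpackhi_epi64(u1, u3);
  kernel.packet[4] = _mm_unpacklo_epi64(u4, u6);
  kernel.packet[5] = _mm_unpackhi_epi64(u4, u6);
  kernel.packet[6] = _mm_unpacklo_epi64(u5, u7);
  kernel.packet[7] = _mm_unpackhi_epi64(u5, u7);
}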