// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false,
         int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
class gebp_traits;
/** \internal \returns b if a<=0, and returns a otherwise. */
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
{
  return a<=0 ? b : a;
}
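// For instance (illustrative values only): manage_caching_sizes_helper(0, 32*1024)
// falls back to the default 32*1024, while manage_caching_sizes_helper(16*1024, 32*1024)
// keeps the explicitly provided 16*1024.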
#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)

#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)

#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
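// Illustrative build-time override (hypothetical values): compiling with
//   -DEIGEN_DEFAULT_L1_CACHE_SIZE=16384 -DEIGEN_DEFAULT_L2_CACHE_SIZE=262144
// makes the EIGEN_SET_DEFAULT_L*_CACHE_SIZE(val) macros above expand to the
// user-provided constants instead of the val argument passed at the use site.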
if(action==SetAction)
{
  // set the cpu cache sizes and cache all block sizes from a global cache size in bytes
eigen_internal_assert(l1!=0 && l2!=0);
m_cacheSizes.m_l1 = *l1;
m_cacheSizes.m_l2 = *l2;
m_cacheSizes.m_l3 = *l3;
}
else if(action==GetAction)
{
eigen_internal_assert(l1!=0 && l2!=0);
*l1 = m_cacheSizes.m_l1;
*l2 = m_cacheSizes.m_l2;
*l3 = m_cacheSizes.m_l3;
} else
{
eigen_internal_assert(false);
}
}
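// Sketch of the intended call patterns (the public cache-size accessors are
// expected to be thin wrappers around this function):
//   std::ptrdiff_t l1, l2, l3;
//   manage_caching_sizes(GetAction, &l1, &l2, &l3); // query the current sizes
//   manage_caching_sizes(SetAction, &l1, &l2, &l3); // override them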
/* Helper for computeProductBlockingSizes.
 *
 * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
 * this function computes the blocking size parameters along the respective dimensions
 * for matrix products and related algorithms. The blocking sizes depend on various
 * parameters:
 * - the L1 and L2 cache sizes,
 * - the register-level blocking sizes defined by gebp_traits,
 * - the number of scalars that fit into a packet (when vectorization is enabled).
 *
 * \sa setCpuCacheSizes */
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
  // Explanations:
  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
  // kc x nc blocks B' on the rhs. B' has to fit into the L2/L3 cache. Moreover, A' is processed
  // per mr x kc horizontal small panels, where mr is the blocking size along the m dimension
  // at the register level. This small horizontal panel has to stay within the L1 cache.
std::ptrdiff_t l1, l2, l3;
  manage_caching_sizes(GetAction, &l1, &l2, &l3);
#ifdef EIGEN_VECTORIZE_AVX512
  // We still need to find a rationale for the following adjustment, but without it,
  // performance with AVX512 is pretty bad, e.g., -20% slower.
  // One reason is that with an increasing packet size, the blocking size k
  // has to become pretty small if we want one lhs panel to fit within L1.
  // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels is
  // k*(3*64 + 4*8) bytes; with l1=32kB and k%8==0, we get k=144.
  // This is quite small for a good reuse of the accumulation registers.
  l1 *= 4;
#endif
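  // Sanity check of the figures in the comment above: the panels stream
  // 3*64 + 4*8 = 224 bytes per unit of k, and 32768/224 ~= 146, which rounds
  // down to k = 144 as a multiple of 8.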
  if (num_threads > 1) {
    typedef typename Traits::ResScalar ResScalar;
    enum {
kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
kr = 8,
mr = Traits::mr,
nr = Traits::nr
    };
    // Increasing k gives us more time to prefetch the content of the "C"
    // registers. However, once the latency is hidden there is no point in
    // increasing the value of k, so we cap it at 320 (value determined
    // experimentally).
    // To prevent k from vanishing, we make k_cache at least as big as kr.
    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
    if (k_cache < k) {
k = k_cache - (k_cache % kr);
eigen_internal_assert(k > 0);
}
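    // Worked example (assuming double operands with mr=12, nr=4, KcFactor=1 and
    // a 32kB L1): kdiv = 12*8 + 4*8 = 128 and ksub = 12*4*8 = 384, hence
    // k_cache = min((32768-384)/128, 320) = 253, and a larger k is clamped to
    // 253 - (253 % 8) = 248.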
    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
    const Index n_per_thread = numext::div_ceil(n, num_threads);
    if (n_cache <= n_per_thread) {
      // Don't exceed the capacity of the l2 cache.
eigen_internal_assert(n_cache >= static_cast<Index>(nr));
n = n_cache - (n_cache % nr);
eigen_internal_assert(n > 0);
} else {
n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
}
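    // Worked example (hypothetical sizes: l1=32kB, l2=256kB, nr=4, double rhs,
    // k=248): n_cache = (262144-32768)/(4*8*248) = 28, already a multiple of nr,
    // so with enough columns per thread n is capped at 28.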
    if (l3 > l2) {
      // l3 is shared among all cores, so we'll give each thread its own chunk of l3.
      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
      const Index m_per_thread = numext::div_ceil(m, num_threads);
      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
m = m_cache - (m_cache % mr);
eigen_internal_assert(m > 0);
} else {
m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
}
}
  } else {
    // In unit tests we do not want to use extra large matrices,
    // so we reduce the cache sizes to check that the blocking strategy is not flawed.
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
l1 = 9*1024;
l2 = 32*1024;
    l3 = 512*1024;
#endif
    // Early return for small problems because the computations below are time consuming
    // for small problems. Perhaps it would make more sense to consider k*n*m??
    // Note that for very tiny problems, this function should be bypassed anyway
    // because we use the coefficient-based implementation for them.
    if((numext::maxi)(k,(numext::maxi)(m,n))<48)
      return;
// ---- 1st level of blocking on L1, yields kc ----
    // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within the L1 cache.
    // We also include a register-level block of the result (mr x nr).
    // (In an ideal world only the lhs panel would stay in L1.)
    // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
    const Index old_k = k;
    if(k>max_kc)
    {
      // We are really blocking on the third dimension:
      // -> reduce blocking size to make sure the last block is as large as possible
      //    while keeping the same number of sweeps over the result.
k = (k%max_kc)==0 ? max_kc
: max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
}
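    // Worked example of the reduction above (illustrative: old_k=300, max_kc=144,
    // k_peeling=8): 300 % 144 = 12, so
    //   k = 144 - 8*((144-1-12)/(8*(300/144+1))) = 144 - 8*5 = 104,
    // and indeed 300/104 == 300/144 == 2 sweeps while k remains a multiple of 8.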
// ---- 2nd level of blocking on max(L2,L3), yields nc ----
    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
    //   actual_l2 = max(l2, l3/nb_core_sharing_l3)
    // The number below is quite conservative: it is better to underestimate the cache size
    // rather than to overestimate it. For instance, it corresponds to 6MB of L3 shared among 4 cores.
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    const Index actual_l2 = l3;
#else
    const Index actual_l2 = 1572864; // == 1.5 MB
#endif
    // Here, nc is chosen such that a block of kc x nc of the rhs fits within half of L2.
    // The second half is implicitly reserved for accessing the result and lhs coefficients.
    // When k<max_kc, nc can grow arbitrarily. In practice, it seems fruitful
    // to limit this growth: we bound nc to grow by a factor of at most 1.5.
    // However, if the entire lhs block fits within L1, then we are not going to block on the rows at all,
    // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
    Index max_nc;
    const Index lhs_bytes = m * k * sizeof(LhsScalar);
    const Index remaining_l1 = l1 - k_sub - lhs_bytes;
    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
    {
      // L1 blocking
max_nc = remaining_l1 / (k*sizeof(RhsScalar));
    }
    else
    {
      // L2 blocking
max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
    }
    // WARNING Below, we assume that Traits::nr is a power of two.
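    // Worked example of the masking below (illustrative: actual_l2=1572864,
    // k=104, double rhs, nr=4): actual_l2/(2*k*8) = 945, and 945 & ~(4-1) = 944,
    // i.e., the largest multiple of nr not exceeding the cache-derived bound.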
    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
    if(n>nc)
    {
      // We are really blocking over the columns:
      // -> reduce blocking size to make sure the last block is as large as possible
      //    while keeping the same number of sweeps over the packed lhs.
      // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1".
n = (n%nc)==0 ? nc
: (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
    }
    else if(old_k==k)
    {
      // So far, no blocking at all, i.e., kc==k and nc==n.
      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in the L1/L2 cache.
      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
Index problem_size = k*n*sizeof(LhsScalar);
Index actual_lm = actual_l2;
      Index max_mc = m;
      if(problem_size<=1024)
      {
        // problem is small enough to be kept in L1
        // Let's choose m such that the lhs's block fits in 1/3 of L1
actual_lm = l1;
      }
      else if(l3!=0 && problem_size<=32768)
      {
        // we have both L2 and L3, and the problem is small enough to be kept in L2
        // Let's choose m such that the lhs's block fits in 1/3 of L2
        actual_lm = l2;
max_mc = (numext::mini<Index>)(576,max_mc);
}
      Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
      if (mc > Traits::mr) mc -= mc % Traits::mr;
      else if (mc==0) return;
m = (m%mc)==0 ? mc
: (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
}
}
}
template <typename Index>
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
{
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
    return true;
  }
#else
EIGEN_UNUSED_VARIABLE(k)
EIGEN_UNUSED_VARIABLE(m)
  EIGEN_UNUSED_VARIABLE(n)
#endif
  return false;
}
/** \brief Computes the blocking parameters for an m x k times k x n matrix product
 *
 * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
 * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
 * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
 *
 * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
 * this function computes the blocking size parameters along the respective dimensions
 * for matrix products and related algorithms.
 *
 * The blocking size parameters may be determined:
 * - either by a heuristic based on cache sizes;
 * - or using fixed prescribed values (for testing purposes).
 *
 * \sa setCpuCacheSizes */
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  if (!useSpecificBlockingSizes(k, m, n)) {
evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
}
}
template<typename LhsScalar, typename RhsScalar, typename Index>
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
}
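// Minimal usage sketch (assuming float operands and the default single thread):
// k, m, n go in as the problem dimensions and come back as the blocking sizes
// kc, mc, nc.
//   Index k = 1024, m = 768, n = 512;
//   computeProductBlockingSizes<float,float>(k, m, n);
//   // k, m, n now bound the panels used by the packing routines and the gebp kernel.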
/* Vectorization logic
 *  real*real: unpack rhs to constant packets, ...
 *
 *  cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
 *          storing each res packet into two packets (2x2),
 *          at the end combine them: swap the second and addsub them
 *  cf*cf : same but with 2x4 blocks
 *  cplx*real : unpack rhs to constant packets, ...
 *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
 */
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
class gebp_traits
{
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
// register block size along the N direction must be 1 or 4
nr = 4,
// register block size along the M direction (currently, this one cannot be modified)
    default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
    && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
    // we assume 16 registers or more
    // See bug 992: if the scalar type is not vectorizable but EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
    // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
    // Bug 1515: MSVC prior to v19.14 leads to register spilling.
    mr = Vectorizable ? 3*LhsPacketSize : default_mr,
#else
    mr = default_mr,
#endif
template<typename LhsPacketType>
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
{
dest = pload<LhsPacketType>(a);
}
template<typename LhsPacketType>
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
{
dest = ploadu<LhsPacketType>(a);
}
template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
{
    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
    // It would be a lot cleaner to call pmadd all the time. Unfortunately, if we
    // let gcc allocate the register in which to store the result of the pmul
    // (in the case where there is no FMA), gcc fails to figure out how to avoid
    // spilling registers.
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
EIGEN_UNUSED_VARIABLE(tmp);
c = cj.pmadd(a,b,c); #else
    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
#endif
}
EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
  {
    // FIXME we can do better!
    // what we want here is a ploadheight
RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
dest = ploadquad<RhsPacket>(tmp);
}
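  // The duplication above mirrors the complex product written out explicitly:
  //   (a_r + i*a_i)*(b_r + i*b_i) = (a_r*b_r - a_i*b_i) + i*(a_r*b_i + a_i*b_r),
  // so multiplying by a packet of duplicated b_r (resp. b_i) yields the partial
  // products (a_r*b_r, a_i*b_r) (resp. (a_r*b_i, a_i*b_i)) that are combined later.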
  // note that for DoublePacket<RealPacket> the "4" in "downto4"
  // corresponds to the number of complexes, so it means "8"
  // in terms of real coefficients.
// nothing special here
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
{
    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
}
template<typename LhsPacketType>
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
{
    dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
}
    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    // FIXME: should depend on NumberOfRegisters
nr = 4,
mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
  EIGEN_DONT_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
Index rows, Index depth, Index cols, ResScalar alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA, const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
ResScalar alpha, SAccPacket &C0)
  {
    typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
    typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
    typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
    typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
if (depth - endk > 0)
    {
      // We have to handle the last row(s) of the rhs, which
      // correspond to a half-packet
SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
{
GEBPTraits traits;
    // loops on each largest micro horizontal panel of lhs
    // (LhsProgress x depth)
    for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
    {
      // loops on each largest micro vertical panel of rhs (depth * nr)
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        // We select a LhsProgress x nr micro block of res
        // which is entirely stored into 1 x nr registers.
const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
prefetch(&blA[0]);
// gets res block as register
AccPacket C0, C1, C2, C3;
traits.initAcc(C0);
traits.initAcc(C1);
traits.initAcc(C2);
        traits.initAcc(C3);
        // To improve instruction pipelining, let's double the accumulation registers:
        // even k will accumulate in C*, while odd k will accumulate in D*.
        // This trick is crucial to get good performance with FMA; otherwise it is
        // actually faster to perform separate MUL+ADD because of a naturally
        // better instruction-level parallelism.
AccPacket D0, D1, D2, D3;
traits.initAcc(D0);
traits.initAcc(D1);
traits.initAcc(D2);
traits.initAcc(D3);
    // Deal with the remaining columns of the rhs
    for(Index j2=packet_cols4; j2<cols; j2++)
    {
      // One column at a time
      const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
prefetch(&blA[0]);
// gets res block as register
AccPacket C0;
traits.initAcc(C0);
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
Index rows, Index depth, Index cols, ResScalar alpha,
Index strideA, Index strideB, Index offsetA, Index offsetB)
{
Traits traits;
SwappedTraits straits;
  if(strideA==-1) strideA = depth;
  if(strideB==-1) strideB = depth;
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
  const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
  const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
  enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
  const Index peeled_kc = depth & ~(pk-1);
  const int prefetch_res_offset = 32/sizeof(ResScalar);
  // const Index depth2 = depth & ~1;
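  // Worked example of the peeling bounds (illustrative: rows=100, LhsProgress=4,
  // mr=12): peeled_mc3 = (100/12)*12 = 96, peeled_mc2 = 96 + (4/8)*8 = 96,
  // peeled_mc1 = 96 + (4/4)*4 = 100; and with depth=500, peeled_kc = 500 & ~7 = 496.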
  //---------- Process 3 * LhsProgress rows at once ----------
  // This corresponds to 3*LhsProgress x nr register blocks.
  // Usually, this makes sense only with FMA.
  if(mr>=3*Traits::LhsProgress)
  {
    // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
    // and on each largest micro vertical panel of the rhs (depth * nr).
    // The blocking sizes, i.e., 'depth', have been computed so that the micro horizontal panel of the lhs fits in L1.
    // However, if depth is too small, we can extend the number of rows of these horizontal panels.
    // This actual number of rows is computed as follows:
    const Index l1 = defaultL1CacheSize; // in bytes, TODO: l1 should be passed to this function.
    // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
    // suggests we should be using: either because our known l1 cache size is inaccurate (e.g., on Android, we can only guess),
    // or because we are testing specific blocking sizes.
    const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
    for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
    {
      const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
{
// We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely // stored into 3 x nr registers.
const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
prefetch(&blA[0]);