/* Returns TRUE if all assumptions met */
static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC)
{
    /* The AVX2 path is built on the invariants checked below. They are    */
    /* treated as fundamental, hence asserts rather than run-time checks.  */
    /* If one of them ever fires, all related code has to be re-examined   */
    /* to make sure it still behaves like the C implementation.            */

    /* Compile-time layout constants */
    silk_assert(MAX_DEL_DEC_STATES     <= 4 &&
                MAX_FRAME_LENGTH     % 4 == 0 &&
                MAX_SUB_FRAME_LENGTH % 4 == 0 &&
                LTP_MEM_LENGTH_MS    % 4 == 0);

    /* Supported internal sampling rates */
    silk_assert(psEncC->fs_kHz == 8  ||
                psEncC->fs_kHz == 12 ||
                psEncC->fs_kHz == 16);

    /* Sub-frame count and number of delayed-decision states are in range */
    silk_assert(psEncC->nb_subfr > 0 &&
                psEncC->nb_subfr <= MAX_NB_SUBFR);
    silk_assert(psEncC->nStatesDelayedDecision > 0 &&
                psEncC->nStatesDelayedDecision <= MAX_DEL_DEC_STATES);

    /* LTP memory length is consistent with the sampling rate */
    silk_assert(psEncC->ltp_mem_length == psEncC->fs_kHz * LTP_MEM_LENGTH_MS);

    /* Regressions were observed on certain AMD Zen CPUs when              */
    /* nStatesDelayedDecision is 1 or 2. Ideally those CPUs would be       */
    /* detected and the optimization enabled everywhere else, but the      */
    /* current OPUS framework offers no good way to do that, so only the   */
    /* 3- and 4-state configurations are accepted.                         */
    switch (psEncC->nStatesDelayedDecision)
    {
    case 3:
    case 4:
        return 1;
    default:
        return 0;
    }
}
/* Intrinsics not defined on MSVC */
#ifdef _MSC_VER
#include <Intsafe.h>

/* Stand-in for the GCC/Clang builtin of the same name.                     */
/* Stores the wrapped sum of a and b in *res and returns non-zero when the  */
/* mathematically exact sum does not fit in 32 bits.                        */
static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res)
{
    /* Add as unsigned so that wrap-around is well defined (signed overflow is UB) */
    *res = (opus_int32)((unsigned int)a + (unsigned int)b);
    /* Overflow happened iff the result's sign differs from the sign of both operands */
    return ((*res ^ a) & (*res ^ b)) < 0;
}

/* Stand-in for the GCC/Clang builtin of the same name.                     */
/* Returns the index of the lowest set bit of x, or 32 when x is zero       */
/* (the real builtin leaves the x == 0 case undefined).                     */
static inline int __builtin_ctz(unsigned int x)
{
    DWORD res = 0;
    return _BitScanForward(&res, x) ? res : 32;
}
#endif
/* Clamp a 32-bit value to the [silk_int16_MIN, silk_int16_MAX] range and   */
/* return it as a 16-bit value.                                             */
static OPUS_INLINE opus_int16 silk_sat16(opus_int32 num)
{
    if (num > silk_int16_MAX)
    {
        num = silk_int16_MAX;
    }
    if (num < silk_int16_MIN)
    {
        num = silk_int16_MIN;
    }
    return num;
}
/* Arithmetic right shift of a by `bits` with rounding: half of the         */
/* divisor (1 << (bits-1)) is added before shifting. bits must be 1..30.    */
static OPUS_INLINE opus_int32 silk_sar_round_32(opus_int32 a, int bits)
{
    silk_assert(bits > 0 && bits < 31);
    return (a + (1 << (bits - 1))) >> bits;
}
/* Rounded right shift of the Q16 product of a and b:                       */
/*   round( ((a * b) >> 16) / 2^bits )                                      */
/* With OPUS_CHECK_ASM the result is computed with the same macros as the   */
/* C code; otherwise the full 64-bit product is shifted in one step.        */
static OPUS_INLINE opus_int64 silk_sar_round_smulww(opus_int32 a, opus_int32 b, int bits)
{
#ifdef OPUS_CHECK_ASM
    silk_assert(bits > 0 && bits < 48);
    return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits);
#else
    /* This code is more correct, but it won't overflow like the C code in some rare cases. */
    opus_int64 t;

    /* The effective shift below is bits + 16 and must stay under 64 */
    silk_assert(bits > 0 && bits < 48);
    t = ((opus_int64)a) * ((opus_int64)b);
    bits += 16;                 /* fold the ">> 16" of SMULWW into the final shift */
    t += 1ull << (bits-1);      /* rounding offset: half of the divisor */
    return t >> bits;
#endif
}
/* Saturating 32-bit addition: returns a + b, clamped to silk_int32_MAX or  */
/* silk_int32_MIN when the exact sum does not fit.                          */
static OPUS_INLINE opus_int32 silk_add_sat32(opus_int32 a, opus_int32 b)
{
    opus_int32 result;

    if (!__builtin_sadd_overflow(a, b, &result))
    {
        return result;
    }
    /* Overflow needs operands of equal sign, so a's sign picks the bound */
    return (a < 0) ? silk_int32_MIN : silk_int32_MAX;
}
/* (a32 * b32) >> 16 */
/* Each 32-bit element of a is sign-extended to 64 bits, multiplied by b,   */
/* and the product is shifted left by 16 before being handed to             */
/* silk_cvtepi64_epi32_high() (defined elsewhere; presumably it keeps the   */
/* upper 32 bits of every 64-bit lane - confirm against its definition).    */
static OPUS_INLINE __m128i silk_mm_smulww_epi32(__m128i a, opus_int32 b)
{
    const __m256i a_wide    = _mm256_cvtepi32_epi64(a);
    const __m256i b_bcast   = _mm256_set1_epi32(b);
    const __m256i product   = _mm256_mul_epi32(a_wide, b_bcast);
    const __m256i product16 = _mm256_slli_epi64(product, 16);

    return silk_cvtepi64_epi32_high(product16);
}
/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */
/* b is moved into the upper half of a 32-bit word (b << 16) before the     */
/* 32x32->64 multiply, so only its low 16 bits contribute to the product.   */
static OPUS_INLINE __m128i silk_mm_smulwb_epi32(__m128i a, opus_int32 b)
{
    const __m256i a_wide  = _mm256_cvtepi32_epi64(a);
    const __m256i b_bcast = _mm256_set1_epi32(silk_LSHIFT(b, 16));
    const __m256i product = _mm256_mul_epi32(a_wide, b_bcast);

    return silk_cvtepi64_epi32_high(product);
}
/* (opus_int32)((opus_int16)(a32)) * (opus_int32)((opus_int16)(b32)) output have to be 32bit int */
/* Only the low 128 bits of each input's two lanes pair up as expected by    */
/* the caller: for every 128-bit lane the four products of the low 16 bits   */
/* of the 32-bit elements are returned as four 32-bit integers.              */
static OPUS_INLINE __m256i silk_mm256_smulbb_epi32(__m256i a, __m256i b)
{
    /* A selector byte with the top bit set makes _mm256_shuffle_epi8 write zero */
    const char FF = (char)0xFF;
    /* Per 128-bit lane: gather 16-bit words 0, 2, 4 and 6 (the low halves   */
    /* of the four 32-bit elements) into the low 64 bits, zero the rest.     */
    __m256i msk = _mm256_set_epi8(
        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0,
        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0);
    /* Low and high 16 bits of every signed 16x16 product */
    __m256i lo = _mm256_mullo_epi16(a, b);
    __m256i hi = _mm256_mulhi_epi16(a, b);

    lo = _mm256_shuffle_epi8(lo, msk);
    hi = _mm256_shuffle_epi8(hi, msk);
    /* Interleave low/high halves to rebuild the 32-bit products */
    return _mm256_unpacklo_epi16(lo, hi);
}
/* Reverse the order of the eight 32-bit elements of v. */
static OPUS_INLINE __m256i silk_mm256_reverse_epi32(__m256i v)
{
    /* 0x1B reverses the four elements inside each 128-bit lane */
    const __m256i lane_reversed = _mm256_shuffle_epi32(v, 0x1B);

    /* 0x4E then swaps the two 128-bit lanes */
    return _mm256_permute4x64_epi64(lane_reversed, 0x4E);
}
/* Horizontal sum of the eight 32-bit elements of v (wrapping 32-bit adds). */
static OPUS_INLINE opus_int32 silk_mm256_hsum_epi32(__m256i v)
{
    const __m128i upper = _mm256_extracti128_si256(v, 1);
    const __m128i lower = _mm256_extracti128_si256(v, 0);
    __m128i acc;

    acc = _mm_add_epi32(upper, lower);                       /* 8 -> 4 partial sums */
    acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, 0x4E));  /* add swapped 64-bit halves */
    acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, 0xB1));  /* add swapped neighbours */
    return _mm_cvtsi128_si32(acc);
}
/* Build a byte-shuffle control word that selects the 32-bit element number */
/* `index`: every 32-bit slot of the result holds the four byte offsets     */
/* 4*index+0 .. 4*index+3.                                                   */
static __m128i silk_index_to_selector(opus_int32 index)
{
    opus_int32 first_byte;

    silk_assert(index < 4);
    first_byte = index << 2;    /* byte offset of the selected element */
    return _mm_set_epi8(
        first_byte + 3, first_byte + 2, first_byte + 1, first_byte + 0,
        first_byte + 3, first_byte + 2, first_byte + 1, first_byte + 0,
        first_byte + 3, first_byte + 2, first_byte + 1, first_byte + 0,
        first_byte + 3, first_byte + 2, first_byte + 1, first_byte + 0);
}
/* Forward declaration; the definition appears further down in this file.   */
/* Scales the input of one subframe and rescales the NSQ / delayed-decision */
/* filter states. Each parameter is tagged I (input), O (output) or I/O.    */
static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2( const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
NSQ_del_dec_struct *psDelDec, /* I/O Delayed decision states */ const opus_int16 x16[], /* I Input */
opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O Input scaled with 1/Gain in Q10 */ const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
opus_int subfr, /* I Subframe number */ const opus_int LTP_scale_Q14, /* I LTP state scaling */ const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I */ const opus_int pitchL[MAX_NB_SUBFR], /* I Pitch lag */ const opus_int signal_type, /* I Signal type */ const opus_int decisionDelay /* I Decision delay */
);
/* Forward declaration; the definition appears further down in this file    */
/* and asserts that order is either 10 or 16.                               */
/*******************************************/ /* LPC analysis filter */ /* NB! State is kept internally and the */ /* filter always starts with zero state */ /* first d output samples are set to zero */ /*******************************************/ static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
opus_int16 *out, /* O Output signal */ const opus_int16 *in, /* I Input signal */ const opus_int16 *B, /* I MA prediction coefficients, Q12 [order] */ const opus_int32 len, /* I Signal length */ const opus_int32 order /* I Filter order */
);
/* Forward declaration; the definition appears further down in this file.   */
/* Each parameter is tagged I (input), O (output) or I/O (updated in place).*/
/******************************************/ /* Noise shape quantizer for one subframe */ /******************************************/ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
silk_nsq_state *NSQ, /* I/O NSQ state */
NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
opus_int signalType, /* I Signal type */ const opus_int32 x_Q10[], /* I */
opus_int8 pulses[], /* O */
opus_int16 xq[], /* O */
opus_int32 sLTP_Q15[], /* I/O LTP filter state */
opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O Gain delay buffer */ const opus_int16 a_Q12[], /* I Short term prediction coefs */ const opus_int16 b_Q14[], /* I Long term prediction coefs */ const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
opus_int lag, /* I Pitch lag */
opus_int32 HarmShapeFIRPacked_Q14, /* I */
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I */
opus_int32 Gain_Q16, /* I */
opus_int Lambda_Q10, /* I */
opus_int offset_Q10, /* I */
opus_int length, /* I Input length */
opus_int subfr, /* I Subframe number */
opus_int shapingLPCOrder, /* I Shaping LPC filter order */
opus_int predictLPCOrder, /* I Prediction filter order */
opus_int warping_Q16, /* I */
__m128i MaskDelDec, /* I Mask of states in decision tree */
opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */
opus_int decisionDelay /* I */
);
void silk_NSQ_del_dec_avx2( const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */ const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR], /* I Long term prediction coefs */ const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR], /* I Long term shaping coefs */ const opus_int Tilt_Q14[MAX_NB_SUBFR], /* I Spectral tilt */ const opus_int32 LF_shp_Q14[MAX_NB_SUBFR], /* I Low frequency shaping coefs */ const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I Quantization step sizes */ const opus_int32 pitchL[MAX_NB_SUBFR], /* I Pitch lags */ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ const opus_int LTP_scale_Q14 /* I LTP state scaling */
)
{ #ifdef OPUS_CHECK_ASM
silk_nsq_state NSQ_c;
SideInfoIndices psIndices_c;
opus_int8 pulses_c[MAX_FRAME_LENGTH]; const opus_int8 *const pulses_a = pulses;
/* Copy final part of signals from winner state to output and long-term filter states */
psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector);
last_smple_idx = smpl_buf_idx + decisionDelay;
Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6; for (i = 0; i < decisionDelay; i++)
{
last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
psSample = &psDelDec.Samples[last_smple_idx];
pulses[i - decisionDelay] =
(opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
pxq[i - decisionDelay] =
silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8));
NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
silk_select_winner(psSample->Shape_Q14, Winner_selector);
} for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
{
NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector);
} for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
{
NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector);
}
/* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
out = _mm256_set1_epi32(order >> 1);
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-0]), _mm256_set1_epi32(silk_LSHIFT(coef16[0], 16)))); /* High DWORD */
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-1]), _mm256_set1_epi32(silk_LSHIFT(coef16[1], 16)))); /* High DWORD */
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-2]), _mm256_set1_epi32(silk_LSHIFT(coef16[2], 16)))); /* High DWORD */
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-3]), _mm256_set1_epi32(silk_LSHIFT(coef16[3], 16)))); /* High DWORD */
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-4]), _mm256_set1_epi32(silk_LSHIFT(coef16[4], 16)))); /* High DWORD */
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-5]), _mm256_set1_epi32(silk_LSHIFT(coef16[5], 16)))); /* High DWORD */
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-6]), _mm256_set1_epi32(silk_LSHIFT(coef16[6], 16)))); /* High DWORD */
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-7]), _mm256_set1_epi32(silk_LSHIFT(coef16[7], 16)))); /* High DWORD */
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-8]), _mm256_set1_epi32(silk_LSHIFT(coef16[8], 16)))); /* High DWORD */
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-9]), _mm256_set1_epi32(silk_LSHIFT(coef16[9], 16)))); /* High DWORD */
if (order == 16)
{
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-10]), _mm256_set1_epi32(silk_LSHIFT(coef16[10], 16)))); /* High DWORD */
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-11]), _mm256_set1_epi32(silk_LSHIFT(coef16[11], 16)))); /* High DWORD */
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-12]), _mm256_set1_epi32(silk_LSHIFT(coef16[12], 16)))); /* High DWORD */
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-13]), _mm256_set1_epi32(silk_LSHIFT(coef16[13], 16)))); /* High DWORD */
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-14]), _mm256_set1_epi32(silk_LSHIFT(coef16[14], 16)))); /* High DWORD */
out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-15]), _mm256_set1_epi32(silk_LSHIFT(coef16[15], 16)))); /* High DWORD */
} return silk_cvtepi64_epi32_high(out);
}
/******************************************/ /* Noise shape quantizer for one subframe */ /******************************************/ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
silk_nsq_state *NSQ, /* I/O NSQ state */
NSQ_del_dec_struct *psDelDec, /* I/O Delayed decision states */
opus_int signalType, /* I Signal type */ const opus_int32 x_Q10[], /* I */
opus_int8 pulses[], /* O */
opus_int16 xq[], /* O */
opus_int32 sLTP_Q15[], /* I/O LTP filter state */
opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O Gain delay buffer */ const opus_int16 a_Q12[], /* I Short term prediction coefs */ const opus_int16 b_Q14[], /* I Long term prediction coefs */ const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
opus_int lag, /* I Pitch lag */
opus_int32 HarmShapeFIRPacked_Q14, /* I */
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I */
opus_int32 Gain_Q16, /* I */
opus_int Lambda_Q10, /* I */
opus_int offset_Q10, /* I */
opus_int length, /* I Input length */
opus_int subfr, /* I Subframe number */
opus_int shapingLPCOrder, /* I Shaping LPC filter order */
opus_int predictLPCOrder, /* I Prediction filter order */
opus_int warping_Q16, /* I */
__m128i MaskDelDec, /* I Mask of states in decision tree */
opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */
opus_int decisionDelay /* I */
)
{ int i;
opus_int32 *shp_lag_ptr = &NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2];
opus_int32 *pred_lag_ptr = &sLTP_Q15[NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2];
opus_int32 Gain_Q10 = Gain_Q16 >> 6;
for (i = 0; i < length; i++)
{ /* Perform common calculations used in all states */ /* NSQ_sample_struct */ /* Low 128 bits => 1st set */ /* High 128 bits => 2nd set */ int j;
__m256i SS_Q_Q10;
__m256i SS_RD_Q10;
__m256i SS_xq_Q14;
__m256i SS_LF_AR_Q14;
__m256i SS_Diff_Q14;
__m256i SS_sLTP_shp_Q14;
__m256i SS_LPC_exc_Q14;
__m256i exc_Q14;
__m256i q_Q10, rr_Q10, rd_Q10;
__m256i mask;
__m128i LPC_pred_Q14, n_AR_Q14;
__m128i RDmin_Q10, RDmax_Q10;
__m128i n_LF_Q14;
__m128i r_Q10, q1_Q0, q1_Q10, q2_Q10;
__m128i Winner_rand_state, Winner_selector;
__m128i tmp0, tmp1;
NSQ_del_dec_sample_struct *psLastSample, *psSample;
opus_int32 RDmin_ind, RDmax_ind, last_smple_idx;
opus_int32 LTP_pred_Q14, n_LTP_Q14;
/* find worst in first set */
RDmax_Q10 = silk_mm_mask_hmax_epi32(_mm256_extracti128_si256(SS_RD_Q10, 0), MaskDelDec); /* find best in second set */
RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_extracti128_si256(SS_RD_Q10, 1), MaskDelDec);
/* Replace a state if best from second set outperforms worst in first set */
tmp0 = _mm_cmplt_epi32(RDmin_Q10, RDmax_Q10); if (!_mm_test_all_zeros(tmp0, tmp0))
{ int t;
RDmax_ind = silk_index_of_first_equal_epi32(RDmax_Q10, _mm256_extracti128_si256(SS_RD_Q10, 0));
RDmin_ind = silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_extracti128_si256(SS_RD_Q10, 1));
tmp1 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(RDmax_ind << 3)));
tmp0 = _mm_blendv_epi8(
_mm_set_epi8(0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0),
silk_index_to_selector(RDmin_ind),
tmp1); for (t = i; t < MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH; t++)
{
psDelDec->sLPC_Q14[t] = _mm_shuffle_epi8(psDelDec->sLPC_Q14[t], tmp0);
}
psDelDec->Seed = _mm_shuffle_epi8(psDelDec->Seed, tmp0);
psDelDec->SeedInit = _mm_shuffle_epi8(psDelDec->SeedInit, tmp0); for (t = 0; t < MAX_SHAPE_LPC_ORDER; t++)
{
psDelDec->sAR2_Q14[t] = _mm_shuffle_epi8(psDelDec->sAR2_Q14[t], tmp0);
} for (t = 0; t < DECISION_DELAY; t++)
{
psDelDec->Samples[t].RandState = _mm_shuffle_epi8(psDelDec->Samples[t].RandState, tmp0);
psDelDec->Samples[t].Q_Q10 = _mm_shuffle_epi8(psDelDec->Samples[t].Q_Q10, tmp0);
psDelDec->Samples[t].Xq_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Xq_Q14, tmp0);
psDelDec->Samples[t].Pred_Q15 = _mm_shuffle_epi8(psDelDec->Samples[t].Pred_Q15, tmp0);
psDelDec->Samples[t].Shape_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Shape_Q14, tmp0);
}
mask = _mm256_castsi128_si256(_mm_blendv_epi8(_mm_set_epi32(0x3, 0x2, 0x1, 0x0), _mm_set1_epi32(RDmin_ind + 4), tmp1));
SS_Q_Q10 = _mm256_permutevar8x32_epi32(SS_Q_Q10, mask);
SS_RD_Q10 = _mm256_permutevar8x32_epi32(SS_RD_Q10, mask);
SS_xq_Q14 = _mm256_permutevar8x32_epi32(SS_xq_Q14, mask);
SS_LF_AR_Q14 = _mm256_permutevar8x32_epi32(SS_LF_AR_Q14, mask);
SS_Diff_Q14 = _mm256_permutevar8x32_epi32(SS_Diff_Q14, mask);
SS_sLTP_shp_Q14 = _mm256_permutevar8x32_epi32(SS_sLTP_shp_Q14, mask);
SS_LPC_exc_Q14 = _mm256_permutevar8x32_epi32(SS_LPC_exc_Q14, mask);
}
/* Write samples from winner to output and long-term filter states */ if (subfr > 0 || i >= decisionDelay)
{
pulses[i - decisionDelay] =
(opus_int8)silk_sar_round_32(silk_select_winner(psLastSample->Q_Q10, Winner_selector), 10);
xq[i - decisionDelay] =
silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psLastSample->Xq_Q14, Winner_selector), delayedGain_Q10[last_smple_idx], 8));
NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay] =
silk_select_winner(psLastSample->Shape_Q14, Winner_selector);
sLTP_Q15[NSQ->sLTP_buf_idx - decisionDelay] =
silk_select_winner(psLastSample->Pred_Q15, Winner_selector);
}
NSQ->sLTP_shp_buf_idx++;
NSQ->sLTP_buf_idx++;
/* Scale the input of one subframe by the inverse quantization gain and     */
/* bring the LTP, shaping and delayed-decision states to the same scale.    */
/* When the gain differs from the previous subframe's gain, every state is  */
/* multiplied by prev_gain / gain and the new gain is remembered in NSQ.    */
static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                           */
    NSQ_del_dec_struct *psDelDec,                   /* I/O  Delayed decision states             */
    const opus_int16 x16[],                         /* I    Input                               */
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH],      /* O    Input scaled with 1/Gain in Q10     */
    const opus_int16 sLTP[],                        /* I    Re-whitened LTP state in Q0         */
    opus_int32 sLTP_Q15[],                          /* O    LTP state matching scaled input     */
    opus_int subfr,                                 /* I    Subframe number                     */
    const opus_int LTP_scale_Q14,                   /* I    LTP state scaling                   */
    const opus_int32 Gains_Q16[MAX_NB_SUBFR],       /* I                                        */
    const opus_int pitchL[MAX_NB_SUBFR],            /* I    Pitch lag                           */
    const opus_int signal_type,                     /* I    Signal type                         */
    const opus_int decisionDelay                    /* I    Decision delay                      */
)
{
    int i;
    opus_int lag;
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
    NSQ_del_dec_sample_struct *psSample;

    /* Both values are read below and must be set first: the pitch lag of   */
    /* this subframe and 1/Gain in Q31 (gain clamped to >= 1 to avoid a     */
    /* division by zero).                                                   */
    lag = pitchL[subfr];
    inv_gain_Q31 = silk_INVERSE32_varQ(silk_max(Gains_Q16[subfr], 1), 47);
    silk_assert(inv_gain_Q31 != 0);

    /* Scale input, four samples per iteration: x_sc_Q10 = (x16 * inv_gain_Q26) >> 16 */
    inv_gain_Q26 = silk_sar_round_32(inv_gain_Q31, 5);
    for (i = 0; i < psEncC->subfr_length; i+=4)
    {
        __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i]));
        x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16);
        _mm_storeu_si128((__m128i*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x));
    }

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if (NSQ->rewhite_flag)
    {
        if (subfr == 0)
        {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT(silk_SMULWB(inv_gain_Q31, LTP_scale_Q14), 2);
        }
        for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++)
        {
            silk_assert(i < MAX_FRAME_LENGTH);
            sLTP_Q15[i] = silk_SMULWB(inv_gain_Q31, sLTP[i]);
        }
    }

    /* Adjust for changing gain */
    if (Gains_Q16[subfr] != NSQ->prev_gain_Q16)
    {
        gain_adj_Q16 = silk_DIV32_varQ(NSQ->prev_gain_Q16, Gains_Q16[subfr], 16);

        /* Scale long-term shaping state, four values per iteration */
        for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4)
        {
            opus_int32 *p = &NSQ->sLTP_shp_Q14[i];
            _mm_storeu_si128((__m128i*)p, silk_mm_smulww_epi32(_mm_loadu_si128((__m128i*)p), gain_adj_Q16));
        }

        /* Scale long-term prediction state */
        if (signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0)
        {
            for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++)
            {
                sLTP_Q15[i] = ((opus_int64)sLTP_Q15[i]) * ((opus_int64)gain_adj_Q16) >> 16;
            }
        }

        /* Scale short-term prediction and shaping states */
        for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
        {
            psDelDec->sLPC_Q14[i] = silk_mm_smulww_epi32(psDelDec->sLPC_Q14[i], gain_adj_Q16);
        }
        for (i = 0; i < DECISION_DELAY; i++)
        {
            psSample = &psDelDec->Samples[i];
            psSample->Pred_Q15 = silk_mm_smulww_epi32(psSample->Pred_Q15, gain_adj_Q16);
            psSample->Shape_Q14 = silk_mm_smulww_epi32(psSample->Shape_Q14, gain_adj_Q16);
        }
        for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
        {
            psDelDec->sAR2_Q14[i] = silk_mm_smulww_epi32(psDelDec->sAR2_Q14[i], gain_adj_Q16);
        }

        /* Remember the gain the states are now scaled for */
        NSQ->prev_gain_Q16 = Gains_Q16[subfr];
    }
}
static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
opus_int16 *out, /* O Output signal */ const opus_int16 *in, /* I Input signal */ const opus_int16 *B, /* I MA prediction coefficients, Q12 [order] */ const opus_int32 len, /* I Signal length */ const opus_int32 order /* I Filter order */
)
{ int i;
opus_int32 out32_Q12, out32;
silk_assert(order == 10 || order == 16);
for(i = order; i < len; i++ )
{ const opus_int16 *in_ptr = &in[ i ]; /* Allowing wrap around so that two wraps can cancel each other. The rare
cases where the result wraps around can only be triggered by invalid streams*/
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.