/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
// There's no pack operation for even and odd, so we need to permute.
*add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
*sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
}
// There's no pack operation for even and odd, so we need to permute.
*add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
*sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
}
// While other architecture combine the load and the stage 1 operations, Power9 // benchmarking show no benefit in such an approach. staticINLINEvoid load(const int16_t *a, int stride, int16x8_t *b) { // Tried out different combinations of load and shift instructions, this is // the fastest one.
{ const int16x8_t l0 = vec_vsx_ld(0, a); const int16x8_t l1 = vec_vsx_ld(0, a + stride); const int16x8_t l2 = vec_vsx_ld(0, a + 2 * stride); const int16x8_t l3 = vec_vsx_ld(0, a + 3 * stride); const int16x8_t l4 = vec_vsx_ld(0, a + 4 * stride); const int16x8_t l5 = vec_vsx_ld(0, a + 5 * stride); const int16x8_t l6 = vec_vsx_ld(0, a + 6 * stride); const int16x8_t l7 = vec_vsx_ld(0, a + 7 * stride);
const int16x8_t l8 = vec_vsx_ld(0, a + 8 * stride); const int16x8_t l9 = vec_vsx_ld(0, a + 9 * stride); const int16x8_t l10 = vec_vsx_ld(0, a + 10 * stride); const int16x8_t l11 = vec_vsx_ld(0, a + 11 * stride); const int16x8_t l12 = vec_vsx_ld(0, a + 12 * stride); const int16x8_t l13 = vec_vsx_ld(0, a + 13 * stride); const int16x8_t l14 = vec_vsx_ld(0, a + 14 * stride); const int16x8_t l15 = vec_vsx_ld(0, a + 15 * stride);
staticINLINEvoid store(tran_low_t *a, const int16x8_t *b) {
vec_vsx_st(b[0], 0, a);
vec_vsx_st(b[8], 0, a + 8);
vec_vsx_st(b[16], 0, a + 16);
vec_vsx_st(b[24], 0, a + 24);
vec_vsx_st(b[1], 0, a + 32);
vec_vsx_st(b[9], 0, a + 40);
vec_vsx_st(b[17], 0, a + 48);
vec_vsx_st(b[25], 0, a + 56);
vec_vsx_st(b[2], 0, a + 64);
vec_vsx_st(b[10], 0, a + 72);
vec_vsx_st(b[18], 0, a + 80);
vec_vsx_st(b[26], 0, a + 88);
vec_vsx_st(b[3], 0, a + 96);
vec_vsx_st(b[11], 0, a + 104);
vec_vsx_st(b[19], 0, a + 112);
vec_vsx_st(b[27], 0, a + 120);
vec_vsx_st(b[4], 0, a + 128);
vec_vsx_st(b[12], 0, a + 136);
vec_vsx_st(b[20], 0, a + 144);
vec_vsx_st(b[28], 0, a + 152);
vec_vsx_st(b[5], 0, a + 160);
vec_vsx_st(b[13], 0, a + 168);
vec_vsx_st(b[21], 0, a + 176);
vec_vsx_st(b[29], 0, a + 184);
vec_vsx_st(b[6], 0, a + 192);
vec_vsx_st(b[14], 0, a + 200);
vec_vsx_st(b[22], 0, a + 208);
vec_vsx_st(b[30], 0, a + 216);
vec_vsx_st(b[7], 0, a + 224);
vec_vsx_st(b[15], 0, a + 232);
vec_vsx_st(b[23], 0, a + 240);
vec_vsx_st(b[31], 0, a + 248);
}
// Returns 1 if negative 0 if positive staticINLINE int16x8_t vec_sign_s16(int16x8_t a) { return vec_sr(a, vec_shift_sign_s16);
}
// Add 2 if positive, 1 if negative, and shift by 2. staticINLINE int16x8_t sub_round_shift(const int16x8_t a) { const int16x8_t sign = vec_sign_s16(a); return vec_sra(vec_sub(vec_add(a, vec_twos_s16), sign), vec_dct_scale_log2);
}
// Add 1 if positive, 2 if negative, and shift by 2. // In practice, add 1, then add the sign bit, then shift without rounding. staticINLINE int16x8_t add_round_shift_s16(const int16x8_t a) { const int16x8_t sign = vec_sign_s16(a); return vec_sra(vec_add(vec_add(a, vec_ones_s16), sign), vec_dct_scale_log2);
}
// 32-point forward DCT over eight columns at a time (one column per lane of
// each int16x8_t). `pass` appears to select first- vs second-pass behavior:
// the add_round_shift_s16 rounding below runs only when pass is nonzero.
//
// NOTE(review): this definition is corrupted in this file and will not
// compile as-is; the individual corruption points are flagged inline below.
// It is also truncated -- many DCT stages are missing before the transpose
// calls at the end. Restore the whole function from the upstream
// implementation rather than patching it piecemeal.
// NOTE(review): `staticvoid` below lost a space; should be `static void`.
staticvoid fdct32_vsx(const int16x8_t *in, int16x8_t *out, int pass) {
int16x8_t temp0[32]; // Hold stages: 1, 4, 7
int16x8_t temp1[32]; // Hold stages: 2, 5
// NOTE(review): a lost newline trapped `int i;` inside the trailing comment
// on the next line, so `i` is never actually declared.
int16x8_t temp2[32]; // Hold stages: 3, 6 int i;
// NOTE(review): a lost newline trapped the loop header
// `for (i = 0; i < 16; i++) {` inside the comment on the next line; the two
// statements after it are the loop body.
// Stage 1 // Unrolling this loops actually slows down Power9 benchmarks for (i = 0; i < 16; i++) {
temp0[i] = vec_add(in[i], in[31 - i]); // pass through to stage 3.
temp1[i + 16] = vec_sub(in[15 - i], in[i + 16]);
}
// NOTE(review): same corruption as stage 1 -- the loop header
// `for (i = 0; i < 8; i++) {` is trapped inside the comment on the next line.
// Stage 2 // Unrolling this loops actually slows down Power9 benchmarks for (i = 0; i < 8; i++) {
temp1[i] = vec_add(temp0[i], temp0[15 - i]);
temp1[i + 8] = vec_sub(temp0[7 - i], temp0[i + 8]);
}
// Apply butterflies (in place) on pass through to stage 3.
single_butterfly(temp1[27], temp1[20], &temp1[27], &temp1[20]);
single_butterfly(temp1[26], temp1[21], &temp1[26], &temp1[21]);
single_butterfly(temp1[25], temp1[22], &temp1[25], &temp1[22]);
single_butterfly(temp1[24], temp1[23], &temp1[24], &temp1[23]);
// NOTE(review): the guard `if (pass) {` is trapped inside the comment on the
// next line; the 16 add_round_shift_s16 calls after it are its body. The
// comment text itself also looks truncated at its start.
// dump the magnitude by 4, hence the intermediate values are within // the range of 16 bits. if (pass) {
temp1[0] = add_round_shift_s16(temp1[0]);
temp1[1] = add_round_shift_s16(temp1[1]);
temp1[2] = add_round_shift_s16(temp1[2]);
temp1[3] = add_round_shift_s16(temp1[3]);
temp1[4] = add_round_shift_s16(temp1[4]);
temp1[5] = add_round_shift_s16(temp1[5]);
temp1[6] = add_round_shift_s16(temp1[6]);
temp1[7] = add_round_shift_s16(temp1[7]);
temp1[8] = add_round_shift_s16(temp1[8]);
temp1[9] = add_round_shift_s16(temp1[9]);
temp1[10] = add_round_shift_s16(temp1[10]);
temp1[11] = add_round_shift_s16(temp1[11]);
temp1[12] = add_round_shift_s16(temp1[12]);
temp1[13] = add_round_shift_s16(temp1[13]);
temp1[14] = add_round_shift_s16(temp1[14]);
temp1[15] = add_round_shift_s16(temp1[15]);
// NOTE(review): the function is truncated after this point: the remaining
// DCT stages are missing, and `temp3`/`temp4` used below are never declared
// anywhere in this fragment.
// Generate the top row by munging the first set of 8 from each one // together.
transpose_8x8(&temp1[0], &temp0[0]);
transpose_8x8(&temp2[0], &temp0[8]);
transpose_8x8(&temp3[0], &temp0[16]);
transpose_8x8(&temp4[0], &temp0[24]);
/* NOTE(review): The German text that followed here is extraneous website
 * boilerplate that leaked into this source file; it is not part of the code
 * and should be removed. English translation, kept only for the record:
 * "The information on this website has been carefully compiled to the best
 * of our knowledge. However, no guarantee is given as to the completeness,
 * correctness, or quality of the information provided.
 * Note: the colored syntax highlighting and the measurement are still
 * experimental."
 */