/*
* Copyright (c) 2022 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
/*
 * Transposes 16-bit elements held in four vectors, treating the low four
 * lanes and the high four lanes of the inputs as two independent 4x4 blocks.
 * After the macro, the low 64 bits of _outK hold lane K of _in0.._in3 and the
 * high 64 bits hold lane K + 4 of _in0.._in3.
 *
 * Every input is consumed into a temporary before any output is written, so
 * the outputs may name the same variables as the inputs.
 */
#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  do {                                                                         \
    __m128i _lo02, _lo13, _hi02, _hi13;                                        \
    __m128i _q0, _q1, _q2, _q3;                                                \
                                                                               \
    _lo02 = __lsx_vilvl_h(_in2, _in0);                                         \
    _lo13 = __lsx_vilvl_h(_in3, _in1);                                         \
    _hi02 = __lsx_vilvh_h(_in2, _in0);                                         \
    _hi13 = __lsx_vilvh_h(_in3, _in1);                                         \
    _q0 = __lsx_vilvl_h(_lo13, _lo02);                                         \
    _q1 = __lsx_vilvh_h(_lo13, _lo02);                                         \
    _q2 = __lsx_vilvl_h(_hi13, _hi02);                                         \
    _q3 = __lsx_vilvh_h(_hi13, _hi02);                                         \
    _out0 = __lsx_vpickev_d(_q2, _q0);                                         \
    _out2 = __lsx_vpickev_d(_q3, _q1);                                         \
    _out1 = __lsx_vpickod_d(_q2, _q0);                                         \
    _out3 = __lsx_vpickod_d(_q3, _q1);                                         \
  } while (0)
#if !CONFIG_VP9_HIGHBITDEPTH
/*
 * Column pass of the 16x16 forward DCT for one strip of 8 columns.
 *
 * input      - top-left sample of the 8-column strip; 16 rows are read.
 * tmp_ptr    - destination for the 16 result vectors. The even-part results
 *              are stored at byte offsets 0, 64, ..., 448 from tmp_ptr; the
 *              odd-part results are stored at the same byte offsets from
 *              tmp_ptr + 16 elements.
 * src_stride - distance between consecutive input rows, in int16_t elements
 *              (doubled below to obtain the byte offsets used by the loads).
 */
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                        int32_t src_stride) {
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
  __m128i stp21, stp22, stp23, stp24, stp25, stp26, stp30;
  __m128i stp31, stp32, stp33, stp34, stp35, stp36, stp37;
  __m128i vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
  /* Packed 16-bit transform constants, picked lane by lane with
   * __lsx_vreplvei_h below (presumably the cospi_*_64 table values and their
   * negations -- confirm against the scalar reference). */
  __m128i coeff = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df };
  __m128i coeff1 = { 0x289a317906463fb1, 0x12943d3f1e2b3871 };
  __m128i coeff2 = { 0xed6cd766c78fc04f, 0x0 };
  /* Byte offsets of rows 1, 2, 3 and 4 relative to the current base. */
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t src_stride6 = src_stride4 + src_stride2;
  int32_t src_stride8 = src_stride4 << 1;
  int16_t *input_tmp = (int16_t *)input;

  /* Load the 16 input rows. Adding src_stride4 to the int16_t pointer moves
   * it forward by four rows, so every group reads rows base+1 .. base+4. */
  in0 = __lsx_vld(input_tmp, 0);
  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
            input_tmp, src_stride6, input_tmp, src_stride8, in1, in2, in3, in4);
  input_tmp += src_stride4;
  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
            input_tmp, src_stride6, input_tmp, src_stride8, in5, in6, in7, in8);
  input_tmp += src_stride4;
  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
            input_tmp, src_stride6, input_tmp, src_stride8, in9, in10, in11,
            in12);
  input_tmp += src_stride4;
  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in13,
            in14);
  input_tmp += src_stride2;
  in15 = __lsx_vldx(input_tmp, src_stride2);

  /* Scale every sample by 4 before the transform. */
  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
  DUP4_ARG2(__lsx_vslli_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
            in11);
  DUP4_ARG2(__lsx_vslli_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
            in15);

  /* Even half: sums of mirrored rows (k, 15 - k) go through FDCT8x16_EVEN. */
  DUP4_ARG2(__lsx_vadd_h, in0, in15, in1, in14, in2, in13, in3, in12, tmp0,
            tmp1, tmp2, tmp3);
  DUP4_ARG2(__lsx_vadd_h, in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5,
            tmp6, tmp7);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  __lsx_vst(tmp0, tmp_ptr, 0);
  __lsx_vst(tmp1, tmp_ptr, 64);
  __lsx_vst(tmp2, tmp_ptr, 128);
  __lsx_vst(tmp3, tmp_ptr, 192);
  __lsx_vst(tmp4, tmp_ptr, 256);
  __lsx_vst(tmp5, tmp_ptr, 320);
  __lsx_vst(tmp6, tmp_ptr, 384);
  __lsx_vst(tmp7, tmp_ptr, 448);

  /* Odd half: differences of the same mirrored pairs, written back into
   * in15..in8. */
  DUP4_ARG2(__lsx_vsub_h, in0, in15, in1, in14, in2, in13, in3, in12, in15,
            in14, in13, in12);
  DUP4_ARG2(__lsx_vsub_h, in4, in11, in5, in10, in6, in9, in7, in8, in11, in10,
            in9, in8);

  /* The odd results are stored 16 elements (32 bytes) past the even ones. */
  tmp_ptr += 16;

  /* stp 1 */
  DUP2_ARG2(__lsx_vilvh_h, in10, in13, in11, in12, vec2, vec4);
  DUP2_ARG2(__lsx_vilvl_h, in10, in13, in11, in12, vec3, vec5);

  cnst4 = __lsx_vreplvei_h(coeff, 0);
  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4, stp25);

  cnst5 = __lsx_vreplvei_h(coeff, 1);
  cnst5 = __lsx_vpackev_h(cnst5, cnst4);
  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5, stp22);
  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4, stp24);
  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5, stp23);

  /* stp2 */
  LSX_BUTTERFLY_4_H(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
  LSX_BUTTERFLY_4_H(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);

  /* vec4/vec5 produced here are consumed again further down, after the first
   * four odd outputs have been stored. */
  DUP2_ARG2(__lsx_vilvh_h, stp36, stp31, stp35, stp32, vec2, vec4);
  DUP2_ARG2(__lsx_vilvl_h, stp36, stp31, stp35, stp32, vec3, vec5);

  DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 3, cnst0, cnst1);
  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0, stp26);

  cnst0 = __lsx_vreplvei_h(coeff, 4);
  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1, stp21);

  LSX_BUTTERFLY_4_H(stp30, stp37, stp26, stp21, in8, in15, in14, in9);

  vec1 = __lsx_vilvl_h(in15, in8);
  vec0 = __lsx_vilvh_h(in15, in8);

  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 0, coeff1, 1, cnst0, cnst1);
  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
  __lsx_vst(in8, tmp_ptr, 0);

  cnst0 = __lsx_vreplvei_h(coeff2, 0);
  cnst0 = __lsx_vpackev_h(cnst1, cnst0);
  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
  __lsx_vst(in8, tmp_ptr, 448);

  vec1 = __lsx_vilvl_h(in14, in9);
  vec0 = __lsx_vilvh_h(in14, in9);

  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 2, coeff1, 3, cnst0, cnst1);
  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
  __lsx_vst(in8, tmp_ptr, 256);

  cnst1 = __lsx_vreplvei_h(coeff2, 2);
  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
  __lsx_vst(in8, tmp_ptr, 192);

  DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 5, cnst0, cnst1);
  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp25);

  cnst1 = __lsx_vreplvei_h(coeff, 3);
  cnst1 = __lsx_vpackev_h(cnst0, cnst1);
  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp22);

  /* stp4 */
  DUP2_ARG2(__lsx_vadd_h, stp34, stp25, stp33, stp22, in13, in10);

  vec1 = __lsx_vilvl_h(in13, in10);
  vec0 = __lsx_vilvh_h(in13, in10);

  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 4, coeff1, 5, cnst0, cnst1);
  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
  __lsx_vst(in8, tmp_ptr, 128);

  cnst0 = __lsx_vreplvei_h(coeff2, 1);
  cnst0 = __lsx_vpackev_h(cnst1, cnst0);
  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
  __lsx_vst(in8, tmp_ptr, 320);

  DUP2_ARG2(__lsx_vsub_h, stp34, stp25, stp33, stp22, in12, in11);

  vec1 = __lsx_vilvl_h(in12, in11);
  vec0 = __lsx_vilvh_h(in12, in11);

  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 6, coeff1, 7, cnst0, cnst1);
  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
  __lsx_vst(in8, tmp_ptr, 384);

  cnst1 = __lsx_vreplvei_h(coeff2, 3);
  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
  __lsx_vst(in8, tmp_ptr, 64);
}
/*
 * Row pass of the 16x16 forward DCT for a band of eight 16-element rows.
 *
 * input  - eight rows of 16 int16_t values (32 bytes per row). The buffer is
 *          also used as scratch: the first 16 bytes of each row are
 *          overwritten with intermediate values.
 * output - receives sixteen 8-element vectors, written at byte offsets
 *          0, 16, ..., 240.
 */
void fdct16x8_1d_row(int16_t *input, int16_t *output) {
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
  __m128i even0, even1, even2, even3, even4, even5, even6, even7;

  /* First 8 values of each row. */
  DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in0, in1, in2,
            in3);
  DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in4, in5,
            in6, in7);
  /* Last 8 values of each row. */
  DUP4_ARG2(__lsx_vld, input, 16, input, 48, input, 80, input, 112, in8, in9,
            in10, in11);
  DUP4_ARG2(__lsx_vld, input, 144, input, 176, input, 208, input, 240, in12,
            in13, in14, in15);

  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);

  /* Intermediate rounding: x = (x + 1) >> 2 on every lane. */
  DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);

  DUP4_ARG2(__lsx_vaddi_hu, in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
  DUP4_ARG2(__lsx_vsrai_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);

  DUP4_ARG2(__lsx_vaddi_hu, in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10,
            in11);
  DUP4_ARG2(__lsx_vsrai_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
            in11);

  DUP4_ARG2(__lsx_vaddi_hu, in12, 1, in13, 1, in14, 1, in15, 1, in12, in13,
            in14, in15);
  DUP4_ARG2(__lsx_vsrai_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
            in15);

  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
                     in11, in12, in13, in14, in15, even0, even1, even2, even3,
                     even4, even5, even6, even7, in8, in9, in10, in11, in12,
                     in13, in14, in15);

  /* Park the second group of butterfly outputs in the input buffer while the
   * even half is computed. */
  __lsx_vst(in8, input, 0);
  __lsx_vst(in9, input, 32);
  __lsx_vst(in10, input, 64);
  __lsx_vst(in11, input, 96);
  __lsx_vst(in12, input, 128);
  __lsx_vst(in13, input, 160);
  __lsx_vst(in14, input, 192);
  __lsx_vst(in15, input, 224);

  FDCT8x16_EVEN(even0, even1, even2, even3, even4, even5, even6, even7, even0,
                even1, even2, even3, even4, even5, even6, even7);

  /* Fetch the parked values back and run the odd half on them. */
  in8 = __lsx_vld(input, 0);
  in9 = __lsx_vld(input, 32);
  in10 = __lsx_vld(input, 64);
  in11 = __lsx_vld(input, 96);
  in12 = __lsx_vld(input, 128);
  in13 = __lsx_vld(input, 160);
  in14 = __lsx_vld(input, 192);
  in15 = __lsx_vld(input, 224);

  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
               in4, in5, in6, in7);

  /* Interleave even/odd results, transpose, and store the first eight
   * vectors at 32-byte intervals starting at byte 0. */
  LSX_TRANSPOSE8x8_H(even0, in0, even1, in1, even2, in2, even3, in3, even0, in0,
                     even1, in1, even2, in2, even3, in3);
  __lsx_vst(even0, output, 0);
  __lsx_vst(in0, output, 32);
  __lsx_vst(even1, output, 64);
  __lsx_vst(in1, output, 96);
  __lsx_vst(even2, output, 128);
  __lsx_vst(in2, output, 160);
  __lsx_vst(even3, output, 192);
  __lsx_vst(in3, output, 224);

  /* Same for the remaining eight vectors, starting at byte 16. */
  LSX_TRANSPOSE8x8_H(even4, in4, even5, in5, even6, in6, even7, in7, even4, in4,
                     even5, in5, even6, in6, even7, in7);
  __lsx_vst(even4, output, 16);
  __lsx_vst(in4, output, 48);
  __lsx_vst(even5, output, 80);
  __lsx_vst(in5, output, 112);
  __lsx_vst(even6, output, 144);
  __lsx_vst(in6, output, 176);
  __lsx_vst(even7, output, 208);
  __lsx_vst(in7, output, 240);
}
/*
 * 4x4 forward DCT.
 *
 * input      - top-left sample of the block; one full 16-byte vector is
 *              loaded from each of the four rows.
 * output     - receives the 16 coefficients as two 16-byte stores.
 * src_stride - distance between consecutive input rows, in int16_t elements
 *              (doubled below to obtain byte offsets for the loads).
 */
void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  __m128i in0, in1, in2, in3;
  /* Byte offsets of rows 1, 2 and 3. */
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t src_stride6 = src_stride4 + src_stride2;
  /* __lsx_vldx takes a non-const pointer; go through a non-const alias, as
   * the other transforms in this file do, rather than dropping the qualifier
   * implicitly at each call. The data is only read. */
  int16_t *input_tmp = (int16_t *)input;

  in0 = __lsx_vld(input_tmp, 0);
  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1,
            in2);
  in3 = __lsx_vldx(input_tmp, src_stride6);

  /* fdct4 pre-process */
  {
    __m128i vec, mask;
    __m128i zero = __lsx_vldi(0);

    /* mask has byte 0 set to 1 and every other byte zero. */
    mask = __lsx_vinsgr2vr_b(zero, 1, 0);
    /* Scale all samples by 16. */
    DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2,
              in3);
    /* Build an all-ones pattern in every 16-bit lane of in0 that is non-zero,
     * then keep only the masked bit: the very first sample gets +1 when it is
     * non-zero. */
    vec = __lsx_vseqi_h(in0, 0);
    vec = __lsx_vxori_b(vec, 255);
    vec = __lsx_vand_v(mask, vec);
    in0 = __lsx_vadd_h(in0, vec);
  }

  /* Two 1-D passes, each followed by a transpose. */
  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);

  /* Final rounding: (x + 1) >> 2. */
  DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);

  /* Pack the low 64 bits of (in0, in1) and of (in2, in3) into one vector each
   * and store them back to back. */
  DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2);
  __lsx_vst(in0, output, 0);
  __lsx_vst(in2, output, 16);
}
/*
 * 8x8 forward DCT.
 *
 * input      - top-left sample of the block; eight rows of eight int16_t.
 * output     - receives the 64 coefficients as eight consecutive 16-byte
 *              vectors.
 * src_stride - distance between consecutive input rows, in int16_t elements.
 */
void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  /* Byte offsets of rows 1, 2 and 3 from a base row. */
  int32_t row1_off = src_stride << 1;
  int32_t row2_off = row1_off << 1;
  int32_t row3_off = row2_off + row1_off;
  int16_t *top = (int16_t *)input;
  /* row2_off equals 4 * src_stride, so as an element count it advances the
   * int16_t pointer by four rows. */
  int16_t *bottom = top + row2_off;

  /* Rows 0..3. */
  in0 = __lsx_vld(top, 0);
  in1 = __lsx_vldx(top, row1_off);
  in2 = __lsx_vldx(top, row2_off);
  in3 = __lsx_vldx(top, row3_off);
  /* Rows 4..7. */
  in4 = __lsx_vld(bottom, 0);
  in5 = __lsx_vldx(bottom, row1_off);
  in6 = __lsx_vldx(bottom, row2_off);
  in7 = __lsx_vldx(bottom, row3_off);

  /* Scale every sample by 4. */
  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);

  /* First 1-D pass and transpose. */
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
            in5, in6, in7);
  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);

  /* Second 1-D pass and transpose. */
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
            in5, in6, in7);
  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);

  /* Final scaling of all eight vectors, done by the shared header macro. */
  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);

  __lsx_vst(in0, output, 0);
  __lsx_vst(in1, output, 16);
  __lsx_vst(in2, output, 32);
  __lsx_vst(in3, output, 48);
  __lsx_vst(in4, output, 64);
  __lsx_vst(in5, output, 80);
  __lsx_vst(in6, output, 96);
  __lsx_vst(in7, output, 112);
}
/*
 * 16x16 forward DCT, done as a column pass into a 16x16 scratch buffer
 * followed by a row pass into the output.
 *
 * input      - top-left sample of the 16x16 block.
 * output     - receives the 256 coefficients.
 * src_stride - passed through unchanged to fdct8x16_1d_column().
 */
void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output,
                       int32_t src_stride) {
  int32_t half;
  DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);

  /* Column transform: columns 0..7, then columns 8..15. */
  for (half = 0; half < 2; half++) {
    const int32_t col = 8 * half;

    fdct8x16_1d_column(input + col, tmp_buf + col, src_stride);
  }

  /* Row transform: 128 elements (eight rows of 16) per call. */
  for (half = 0; half < 2; half++) {
    const int32_t offset = 128 * half;

    fdct16x8_1d_row(tmp_buf + offset, output + offset);
  }
}
#endif // !CONFIG_VP9_HIGHBITDEPTH
Messung V0.5 in Prozent C=91 H=91 G=90