/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_
#define THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_
#include "convolve_avx2.h"
static void convolve_2d_sr_hor_2tap_avx2(
const uint8_t *const src, const int32_t src_stride, const int32_t w,
const int32_t h, const InterpFilterParams *const filter_params_x,
const int32_t subpel_x_q4, int16_t *const im_block) {
const uint8_t *src_ptr = src;
int32_t y = h;
int16_t *im = im_block;
if (w <= 8 ) {
__m128i coeffs_128;
prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, &coeffs_128);
if (w == 2 ) {
do {
const __m128i r =
x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, &coeffs_128);
xy_x_round_store_2x2_sse2(r, im);
src_ptr += 2 * src_stride;
im += 2 * 2 ;
y -= 2 ;
} while (y);
} else if (w == 4 ) {
do {
const __m128i r =
x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, &coeffs_128);
xy_x_round_store_4x2_sse2(r, im);
src_ptr += 2 * src_stride;
im += 2 * 4 ;
y -= 2 ;
} while (y);
} else {
assert(w == 8 );
do {
__m128i r[2 ];
x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, &coeffs_128, r);
xy_x_round_store_8x2_sse2(r, im);
src_ptr += 2 * src_stride;
im += 2 * 8 ;
y -= 2 ;
} while (y);
}
} else {
__m256i coeffs_256;
prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, &coeffs_256);
if (w == 16 ) {
do {
__m256i r[2 ];
x_convolve_2tap_16x2_avx2(src_ptr, src_stride, &coeffs_256, r);
xy_x_round_store_32_avx2(r, im);
src_ptr += 2 * src_stride;
im += 2 * 16 ;
y -= 2 ;
} while (y);
} else if (w == 32 ) {
do {
xy_x_2tap_32_avx2(src_ptr, &coeffs_256, im);
src_ptr += src_stride;
im += 32 ;
} while (--y);
} else if (w == 64 ) {
do {
xy_x_2tap_32_avx2(src_ptr + 0 * 32 , &coeffs_256, im + 0 * 32 );
xy_x_2tap_32_avx2(src_ptr + 1 * 32 , &coeffs_256, im + 1 * 32 );
src_ptr += src_stride;
im += 64 ;
} while (--y);
} else {
assert(w == 128 );
do {
xy_x_2tap_32_avx2(src_ptr + 0 * 32 , &coeffs_256, im + 0 * 32 );
xy_x_2tap_32_avx2(src_ptr + 1 * 32 , &coeffs_256, im + 1 * 32 );
xy_x_2tap_32_avx2(src_ptr + 2 * 32 , &coeffs_256, im + 2 * 32 );
xy_x_2tap_32_avx2(src_ptr + 3 * 32 , &coeffs_256, im + 3 * 32 );
src_ptr += src_stride;
im += 128 ;
} while (--y);
}
}
}
static void convolve_2d_sr_hor_4tap_ssse3(
const uint8_t *const src, const int32_t src_stride, const int32_t w,
const int32_t h, const InterpFilterParams *const filter_params_x,
const int32_t subpel_x_q4, int16_t *const im_block) {
const uint8_t *src_ptr = src - 1 ;
int32_t y = h;
int16_t *im = im_block;
if (w <= 4 ) {
__m128i coeffs_128[2 ];
prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
if (w == 2 ) {
do {
const __m128i r =
x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
xy_x_round_store_2x2_sse2(r, im);
src_ptr += 2 * src_stride;
im += 2 * 2 ;
y -= 2 ;
} while (y);
} else if (w == 4 ) {
do {
const __m128i r =
x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
xy_x_round_store_4x2_sse2(r, im);
src_ptr += 2 * src_stride;
im += 2 * 4 ;
y -= 2 ;
} while (y);
}
} else {
// TODO(chiyotsai@google.com): Add better optimization
__m256i coeffs_256[2 ], filt_256[2 ];
prepare_half_coeffs_4tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
filt_256[0 ] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
filt_256[1 ] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
if (w == 8 ) {
do {
__m256i res =
x_convolve_4tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
xy_x_round_store_8x2_avx2(res, im);
src_ptr += 2 * src_stride;
im += 2 * 8 ;
y -= 2 ;
} while (y);
} else if (w == 16 ) {
do {
__m256i r[2 ];
x_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
xy_x_round_store_32_avx2(r, im);
src_ptr += 2 * src_stride;
im += 2 * 16 ;
y -= 2 ;
} while (y);
} else if (w == 32 ) {
do {
xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
src_ptr += src_stride;
im += 32 ;
} while (--y);
} else if (w == 64 ) {
do {
xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
xy_x_4tap_32_avx2(src_ptr + 32 , coeffs_256, filt_256, im + 32 );
src_ptr += src_stride;
im += 64 ;
} while (--y);
} else {
assert(w == 128 );
do {
xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
xy_x_4tap_32_avx2(src_ptr + 32 , coeffs_256, filt_256, im + 32 );
xy_x_4tap_32_avx2(src_ptr + 64 , coeffs_256, filt_256, im + 64 );
xy_x_4tap_32_avx2(src_ptr + 96 , coeffs_256, filt_256, im + 96 );
src_ptr += src_stride;
im += 128 ;
} while (--y);
}
}
}
static void convolve_2d_sr_hor_6tap_avx2(
const uint8_t *const src, const int32_t src_stride, const int32_t w,
const int32_t h, const InterpFilterParams *const filter_params_x,
const int32_t subpel_x_q4, int16_t *const im_block) {
const uint8_t *src_ptr = src - 2 ;
int32_t y = h;
int16_t *im = im_block;
if (w <= 4 ) {
__m128i coeffs_128[3 ];
prepare_half_coeffs_6tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
if (w == 2 ) {
do {
const __m128i r =
x_convolve_6tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
xy_x_round_store_2x2_sse2(r, im);
src_ptr += 2 * src_stride;
im += 2 * 2 ;
y -= 2 ;
} while (y);
} else if (w == 4 ) {
do {
const __m128i r =
x_convolve_6tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
xy_x_round_store_4x2_sse2(r, im);
src_ptr += 2 * src_stride;
im += 2 * 4 ;
y -= 2 ;
} while (y);
}
} else {
__m256i coeffs_256[3 ], filt_256[3 ];
filt_256[0 ] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
filt_256[1 ] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
filt_256[2 ] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
if (w == 8 ) {
do {
const __m256i res =
x_convolve_6tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
xy_x_round_store_8x2_avx2(res, im);
src_ptr += 2 * src_stride;
im += 2 * 8 ;
y -= 2 ;
} while (y);
} else if (w == 16 ) {
do {
__m256i r[2 ];
x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
xy_x_round_store_32_avx2(r, im);
src_ptr += 2 * src_stride;
im += 2 * 16 ;
y -= 2 ;
} while (y);
} else if (w == 32 ) {
do {
xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
src_ptr += src_stride;
im += 32 ;
} while (--y);
} else if (w == 64 ) {
do {
xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
xy_x_6tap_32_avx2(src_ptr + 32 , coeffs_256, filt_256, im + 32 );
src_ptr += src_stride;
im += 64 ;
} while (--y);
} else {
assert(w == 128 );
do {
xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
xy_x_6tap_32_avx2(src_ptr + 32 , coeffs_256, filt_256, im + 32 );
xy_x_6tap_32_avx2(src_ptr + 64 , coeffs_256, filt_256, im + 64 );
xy_x_6tap_32_avx2(src_ptr + 96 , coeffs_256, filt_256, im + 96 );
src_ptr += src_stride;
im += 128 ;
} while (--y);
}
}
}
static void convolve_2d_sr_hor_8tap_avx2(
const uint8_t *const src, const int32_t src_stride, const int32_t w,
const int32_t h, const InterpFilterParams *const filter_params_x,
const int32_t subpel_x_q4, int16_t *const im_block) {
const uint8_t *src_ptr = src - 3 ;
int32_t y = h;
int16_t *im = im_block;
__m256i coeffs_256[4 ], filt_256[4 ];
filt_256[0 ] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
filt_256[1 ] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
filt_256[2 ] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
filt_256[3 ] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
if (w == 8 ) {
do {
const __m256i res =
x_convolve_8tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
xy_x_round_store_8x2_avx2(res, im);
src_ptr += 2 * src_stride;
im += 2 * 8 ;
y -= 2 ;
} while (y);
} else if (w == 16 ) {
do {
__m256i r[2 ];
x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
xy_x_round_store_32_avx2(r, im);
src_ptr += 2 * src_stride;
im += 2 * 16 ;
y -= 2 ;
} while (y);
} else if (w == 32 ) {
do {
xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
src_ptr += src_stride;
im += 32 ;
} while (--y);
} else if (w == 64 ) {
do {
xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
xy_x_8tap_32_avx2(src_ptr + 32 , coeffs_256, filt_256, im + 32 );
src_ptr += src_stride;
im += 64 ;
} while (--y);
} else {
assert(w == 128 );
do {
xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
xy_x_8tap_32_avx2(src_ptr + 32 , coeffs_256, filt_256, im + 32 );
xy_x_8tap_32_avx2(src_ptr + 64 , coeffs_256, filt_256, im + 64 );
xy_x_8tap_32_avx2(src_ptr + 96 , coeffs_256, filt_256, im + 96 );
src_ptr += src_stride;
im += 128 ;
} while (--y);
}
}
static void convolve_2d_sr_ver_2tap_avx2(
const int16_t *const im_block, const int32_t w, const int32_t h,
const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
uint8_t *dst, const int32_t dst_stride) {
const int16_t *im = im_block;
int32_t y = h;
if (w <= 4 ) {
__m128i coeffs_128;
prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, &coeffs_128);
if (w == 2 ) {
__m128i s_32[2 ];
s_32[0 ] = _mm_cvtsi32_si128(*(int32_t *)im);
do {
const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
xy_y_round_store_2x2_sse2(res, dst, dst_stride);
im += 2 * 2 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else {
__m128i s_64[2 ], r[2 ];
assert(w == 4 );
s_64[0 ] = _mm_loadl_epi64((__m128i *)im);
do {
xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
r[0 ] = xy_y_round_sse2(r[0 ]);
r[1 ] = xy_y_round_sse2(r[1 ]);
const __m128i rr = _mm_packs_epi32(r[0 ], r[1 ]);
pack_store_4x2_sse2(rr, dst, dst_stride);
im += 2 * 4 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
}
} else {
__m256i coeffs_256;
prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, &coeffs_256);
if (w == 8 ) {
__m128i s_128[2 ];
__m256i r[2 ];
s_128[0 ] = _mm_loadu_si128((__m128i *)im);
do {
xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
xy_y_round_store_8x2_avx2(r, dst, dst_stride);
im += 2 * 8 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else if (w == 16 ) {
__m256i s_256[2 ], r[4 ];
s_256[0 ] = _mm256_loadu_si256((__m256i *)im);
do {
xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
xy_y_round_store_16x2_avx2(r, dst, dst_stride);
im += 2 * 16 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else if (w == 32 ) {
__m256i s_256[2 ][2 ];
s_256[0 ][0 ] = _mm256_loadu_si256((__m256i *)(im + 0 * 16 ));
s_256[0 ][1 ] = _mm256_loadu_si256((__m256i *)(im + 1 * 16 ));
do {
xy_y_convolve_2tap_32_all_avx2(im + 32 , s_256[0 ], s_256[1 ], &coeffs_256,
dst);
im += 2 * 32 ;
xy_y_convolve_2tap_32_all_avx2(im, s_256[1 ], s_256[0 ], &coeffs_256,
dst + dst_stride);
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else if (w == 64 ) {
__m256i s_256[2 ][4 ];
s_256[0 ][0 ] = _mm256_loadu_si256((__m256i *)(im + 0 * 16 ));
s_256[0 ][1 ] = _mm256_loadu_si256((__m256i *)(im + 1 * 16 ));
s_256[0 ][2 ] = _mm256_loadu_si256((__m256i *)(im + 2 * 16 ));
s_256[0 ][3 ] = _mm256_loadu_si256((__m256i *)(im + 3 * 16 ));
do {
xy_y_convolve_2tap_32_all_avx2(im + 64 , s_256[0 ] + 0 , s_256[1 ] + 0 ,
&coeffs_256, dst);
xy_y_convolve_2tap_32_all_avx2(im + 96 , s_256[0 ] + 2 , s_256[1 ] + 2 ,
&coeffs_256, dst + 32 );
im += 2 * 64 ;
xy_y_convolve_2tap_32_all_avx2(im, s_256[1 ] + 0 , s_256[0 ] + 0 ,
&coeffs_256, dst + dst_stride);
xy_y_convolve_2tap_32_all_avx2(im + 32 , s_256[1 ] + 2 , s_256[0 ] + 2 ,
&coeffs_256, dst + dst_stride + 32 );
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else {
__m256i s_256[2 ][8 ];
assert(w == 128 );
load_16bit_8rows_avx2(im, 16 , s_256[0 ]);
do {
xy_y_convolve_2tap_32_all_avx2(im + 128 , s_256[0 ] + 0 , s_256[1 ] + 0 ,
&coeffs_256, dst);
xy_y_convolve_2tap_32_all_avx2(im + 160 , s_256[0 ] + 2 , s_256[1 ] + 2 ,
&coeffs_256, dst + 1 * 32 );
xy_y_convolve_2tap_32_all_avx2(im + 192 , s_256[0 ] + 4 , s_256[1 ] + 4 ,
&coeffs_256, dst + 2 * 32 );
xy_y_convolve_2tap_32_all_avx2(im + 224 , s_256[0 ] + 6 , s_256[1 ] + 6 ,
&coeffs_256, dst + 3 * 32 );
im += 2 * 128 ;
xy_y_convolve_2tap_32_all_avx2(im, s_256[1 ] + 0 , s_256[0 ] + 0 ,
&coeffs_256, dst + dst_stride);
xy_y_convolve_2tap_32_all_avx2(im + 32 , s_256[1 ] + 2 , s_256[0 ] + 2 ,
&coeffs_256, dst + dst_stride + 1 * 32 );
xy_y_convolve_2tap_32_all_avx2(im + 64 , s_256[1 ] + 4 , s_256[0 ] + 4 ,
&coeffs_256, dst + dst_stride + 2 * 32 );
xy_y_convolve_2tap_32_all_avx2(im + 96 , s_256[1 ] + 6 , s_256[0 ] + 6 ,
&coeffs_256, dst + dst_stride + 3 * 32 );
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
}
}
}
static void convolve_2d_sr_ver_2tap_half_avx2(
const int16_t *const im_block, const int32_t w, const int32_t h,
const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
uint8_t *dst, const int32_t dst_stride) {
const int16_t *im = im_block;
int32_t y = h;
(void )filter_params_y;
(void )subpel_y_q4;
if (w == 2 ) {
__m128i s_32[2 ];
s_32[0 ] = _mm_cvtsi32_si128(*(int32_t *)im);
do {
const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
const __m128i r = xy_y_round_half_pel_sse2(res);
pack_store_2x2_sse2(r, dst, dst_stride);
im += 2 * 2 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else if (w == 4 ) {
__m128i s_64[2 ];
s_64[0 ] = _mm_loadl_epi64((__m128i *)im);
do {
const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
const __m128i r = xy_y_round_half_pel_sse2(res);
pack_store_4x2_sse2(r, dst, dst_stride);
im += 2 * 4 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else if (w == 8 ) {
__m128i s_128[2 ];
s_128[0 ] = _mm_loadu_si128((__m128i *)im);
do {
const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
const __m256i r = xy_y_round_half_pel_avx2(res);
pack_store_8x2_avx2(r, dst, dst_stride);
im += 2 * 8 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else if (w == 16 ) {
__m256i s_256[2 ], r[2 ];
s_256[0 ] = _mm256_loadu_si256((__m256i *)im);
do {
xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
r[0 ] = xy_y_round_half_pel_avx2(r[0 ]);
r[1 ] = xy_y_round_half_pel_avx2(r[1 ]);
xy_y_pack_store_16x2_avx2(r[0 ], r[1 ], dst, dst_stride);
im += 2 * 16 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else if (w == 32 ) {
__m256i s_256[2 ][2 ];
s_256[0 ][0 ] = _mm256_loadu_si256((__m256i *)(im + 0 * 16 ));
s_256[0 ][1 ] = _mm256_loadu_si256((__m256i *)(im + 1 * 16 ));
do {
xy_y_convolve_2tap_half_pel_32_all_avx2(im + 32 , s_256[0 ], s_256[1 ], dst);
xy_y_convolve_2tap_half_pel_32_all_avx2(im + 2 * 32 , s_256[1 ], s_256[0 ],
dst + dst_stride);
im += 2 * 32 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else if (w == 64 ) {
__m256i s_256[2 ][4 ];
s_256[0 ][0 ] = _mm256_loadu_si256((__m256i *)(im + 0 * 16 ));
s_256[0 ][1 ] = _mm256_loadu_si256((__m256i *)(im + 1 * 16 ));
s_256[0 ][2 ] = _mm256_loadu_si256((__m256i *)(im + 2 * 16 ));
s_256[0 ][3 ] = _mm256_loadu_si256((__m256i *)(im + 3 * 16 ));
do {
xy_y_convolve_2tap_half_pel_32_all_avx2(im + 64 , s_256[0 ] + 0 ,
s_256[1 ] + 0 , dst);
xy_y_convolve_2tap_half_pel_32_all_avx2(im + 96 , s_256[0 ] + 2 ,
s_256[1 ] + 2 , dst + 32 );
im += 2 * 64 ;
xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1 ] + 0 , s_256[0 ] + 0 ,
dst + dst_stride);
xy_y_convolve_2tap_half_pel_32_all_avx2(
im + 32 , s_256[1 ] + 2 , s_256[0 ] + 2 , dst + dst_stride + 32 );
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else {
__m256i s_256[2 ][8 ];
assert(w == 128 );
load_16bit_8rows_avx2(im, 16 , s_256[0 ]);
do {
xy_y_convolve_2tap_half_pel_32_all_avx2(im + 128 , s_256[0 ] + 0 ,
s_256[1 ] + 0 , dst);
xy_y_convolve_2tap_half_pel_32_all_avx2(im + 160 , s_256[0 ] + 2 ,
s_256[1 ] + 2 , dst + 1 * 32 );
xy_y_convolve_2tap_half_pel_32_all_avx2(im + 192 , s_256[0 ] + 4 ,
s_256[1 ] + 4 , dst + 2 * 32 );
xy_y_convolve_2tap_half_pel_32_all_avx2(im + 224 , s_256[0 ] + 6 ,
s_256[1 ] + 6 , dst + 3 * 32 );
im += 2 * 128 ;
xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1 ] + 0 , s_256[0 ] + 0 ,
dst + dst_stride);
xy_y_convolve_2tap_half_pel_32_all_avx2(
im + 32 , s_256[1 ] + 2 , s_256[0 ] + 2 , dst + dst_stride + 1 * 32 );
xy_y_convolve_2tap_half_pel_32_all_avx2(
im + 64 , s_256[1 ] + 4 , s_256[0 ] + 4 , dst + dst_stride + 2 * 32 );
xy_y_convolve_2tap_half_pel_32_all_avx2(
im + 96 , s_256[1 ] + 6 , s_256[0 ] + 6 , dst + dst_stride + 3 * 32 );
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
}
}
static void convolve_2d_sr_ver_4tap_avx2(
const int16_t *const im_block, const int32_t w, const int32_t h,
const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
uint8_t *dst, const int32_t dst_stride) {
const int16_t *im = im_block;
int32_t y = h;
if (w == 2 ) {
__m128i coeffs_128[2 ], s_32[4 ], ss_128[2 ];
prepare_coeffs_4tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
s_32[0 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2 ));
s_32[1 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2 ));
s_32[2 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2 ));
const __m128i src01 = _mm_unpacklo_epi32(s_32[0 ], s_32[1 ]);
const __m128i src12 = _mm_unpacklo_epi32(s_32[1 ], s_32[2 ]);
ss_128[0 ] = _mm_unpacklo_epi16(src01, src12);
do {
const __m128i res =
xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
xy_y_round_store_2x2_sse2(res, dst, dst_stride);
im += 2 * 2 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else {
__m256i coeffs_256[2 ];
prepare_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
if (w == 4 ) {
__m128i s_64[4 ];
__m256i s_256[2 ], ss_256[2 ];
s_64[0 ] = _mm_loadl_epi64((__m128i *)(im + 0 * 4 ));
s_64[1 ] = _mm_loadl_epi64((__m128i *)(im + 1 * 4 ));
s_64[2 ] = _mm_loadl_epi64((__m128i *)(im + 2 * 4 ));
// Load lines a and b. Line a to lower 128, line b to upper 128
s_256[0 ] = _mm256_setr_m128i(s_64[0 ], s_64[1 ]);
s_256[1 ] = _mm256_setr_m128i(s_64[1 ], s_64[2 ]);
ss_256[0 ] = _mm256_unpacklo_epi16(s_256[0 ], s_256[1 ]);
do {
const __m256i res =
xy_y_convolve_4tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
xy_y_round_store_4x2_avx2(res, dst, dst_stride);
im += 2 * 4 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else if (w == 8 ) {
__m256i s_256[4 ], r[2 ];
s_256[0 ] = _mm256_loadu_si256((__m256i *)(im + 0 * 8 ));
s_256[1 ] = _mm256_loadu_si256((__m256i *)(im + 1 * 8 ));
if (subpel_y_q4 != 8 ) {
__m256i ss_256[4 ];
ss_256[0 ] = _mm256_unpacklo_epi16(s_256[0 ], s_256[1 ]);
ss_256[2 ] = _mm256_unpackhi_epi16(s_256[0 ], s_256[1 ]);
do {
xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
xy_y_round_store_8x2_avx2(r, dst, dst_stride);
im += 2 * 8 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else {
do {
xy_y_convolve_4tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
xy_y_round_store_8x2_avx2(r, dst, dst_stride);
im += 2 * 8 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
}
} else if (w == 16 ) {
__m256i s_256[5 ];
s_256[0 ] = _mm256_loadu_si256((__m256i *)(im + 0 * 16 ));
s_256[1 ] = _mm256_loadu_si256((__m256i *)(im + 1 * 16 ));
s_256[2 ] = _mm256_loadu_si256((__m256i *)(im + 2 * 16 ));
if (subpel_y_q4 != 8 ) {
__m256i ss_256[4 ], tt_256[4 ], r[4 ];
ss_256[0 ] = _mm256_unpacklo_epi16(s_256[0 ], s_256[1 ]);
ss_256[2 ] = _mm256_unpackhi_epi16(s_256[0 ], s_256[1 ]);
tt_256[0 ] = _mm256_unpacklo_epi16(s_256[1 ], s_256[2 ]);
tt_256[2 ] = _mm256_unpackhi_epi16(s_256[1 ], s_256[2 ]);
do {
xy_y_convolve_4tap_16x2_avx2(im, s_256, ss_256, tt_256, coeffs_256,
r);
xy_y_round_store_16x2_avx2(r, dst, dst_stride);
im += 2 * 16 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else {
__m256i r[4 ];
do {
xy_y_convolve_4tap_16x2_half_pelavx2(im, s_256, coeffs_256, r);
xy_y_round_store_16x2_avx2(r, dst, dst_stride);
im += 2 * 16 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
}
} else {
/*It's a special condition for OBMC. A/c to Av1 spec 4-tap won't
support for width(w)>16, but for OBMC while predicting above block
it reduces size block to Wx(h/2), for example, if above block size
is 32x8, we get block size as 32x4 for OBMC.*/
int32_t x = 0 ;
assert(!(w % 32 ));
__m256i s_256[2 ][4 ], ss_256[2 ][4 ], tt_256[2 ][4 ], r0[4 ], r1[4 ];
do {
const int16_t *s = im + x;
uint8_t *d = dst + x;
loadu_unpack_16bit_3rows_avx2(s, w, s_256[0 ], ss_256[0 ], tt_256[0 ]);
loadu_unpack_16bit_3rows_avx2(s + 16 , w, s_256[1 ], ss_256[1 ],
tt_256[1 ]);
y = h;
do {
xy_y_convolve_4tap_32x2_avx2(s, w, s_256[0 ], ss_256[0 ], tt_256[0 ],
coeffs_256, r0);
xy_y_convolve_4tap_32x2_avx2(s + 16 , w, s_256[1 ], ss_256[1 ],
tt_256[1 ], coeffs_256, r1);
xy_y_round_store_32_avx2(r0 + 0 , r1 + 0 , d);
xy_y_round_store_32_avx2(r0 + 2 , r1 + 2 , d + dst_stride);
s += 2 * w;
d += 2 * dst_stride;
y -= 2 ;
} while (y);
x += 32 ;
} while (x < w);
}
}
}
static void convolve_2d_sr_ver_6tap_avx2(
const int16_t *const im_block, const int32_t w, const int32_t h,
const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
uint8_t *dst, const int32_t dst_stride) {
const int16_t *im = im_block;
int32_t y;
if (w == 2 ) {
__m128i coeffs_128[3 ], s_32[6 ], ss_128[3 ];
prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
s_32[0 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2 ));
s_32[1 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2 ));
s_32[2 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2 ));
s_32[3 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2 ));
s_32[4 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2 ));
const __m128i src01 = _mm_unpacklo_epi32(s_32[0 ], s_32[1 ]);
const __m128i src12 = _mm_unpacklo_epi32(s_32[1 ], s_32[2 ]);
const __m128i src23 = _mm_unpacklo_epi32(s_32[2 ], s_32[3 ]);
const __m128i src34 = _mm_unpacklo_epi32(s_32[3 ], s_32[4 ]);
ss_128[0 ] = _mm_unpacklo_epi16(src01, src12);
ss_128[1 ] = _mm_unpacklo_epi16(src23, src34);
y = h;
do {
const __m128i res =
xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
xy_y_round_store_2x2_sse2(res, dst, dst_stride);
im += 2 * 2 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else {
__m256i coeffs_256[3 ];
prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
if (w == 4 ) {
__m128i s_64[6 ];
__m256i s_256[6 ], ss_256[3 ];
s_64[0 ] = _mm_loadl_epi64((__m128i *)(im + 0 * 4 ));
s_64[1 ] = _mm_loadl_epi64((__m128i *)(im + 1 * 4 ));
s_64[2 ] = _mm_loadl_epi64((__m128i *)(im + 2 * 4 ));
s_64[3 ] = _mm_loadl_epi64((__m128i *)(im + 3 * 4 ));
s_64[4 ] = _mm_loadl_epi64((__m128i *)(im + 4 * 4 ));
// Load lines a and b. Line a to lower 128, line b to upper 128
s_256[0 ] = _mm256_setr_m128i(s_64[0 ], s_64[1 ]);
s_256[1 ] = _mm256_setr_m128i(s_64[1 ], s_64[2 ]);
s_256[2 ] = _mm256_setr_m128i(s_64[2 ], s_64[3 ]);
s_256[3 ] = _mm256_setr_m128i(s_64[3 ], s_64[4 ]);
ss_256[0 ] = _mm256_unpacklo_epi16(s_256[0 ], s_256[1 ]);
ss_256[1 ] = _mm256_unpacklo_epi16(s_256[2 ], s_256[3 ]);
y = h;
do {
const __m256i res =
xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
xy_y_round_store_4x2_avx2(res, dst, dst_stride);
im += 2 * 4 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else if (w == 8 ) {
__m256i s_256[6 ], r[2 ];
s_256[0 ] = _mm256_loadu_si256((__m256i *)(im + 0 * 8 ));
s_256[1 ] = _mm256_loadu_si256((__m256i *)(im + 1 * 8 ));
s_256[2 ] = _mm256_loadu_si256((__m256i *)(im + 2 * 8 ));
s_256[3 ] = _mm256_loadu_si256((__m256i *)(im + 3 * 8 ));
y = h;
if (subpel_y_q4 != 8 ) {
__m256i ss_256[6 ];
ss_256[0 ] = _mm256_unpacklo_epi16(s_256[0 ], s_256[1 ]);
ss_256[1 ] = _mm256_unpacklo_epi16(s_256[2 ], s_256[3 ]);
ss_256[3 ] = _mm256_unpackhi_epi16(s_256[0 ], s_256[1 ]);
ss_256[4 ] = _mm256_unpackhi_epi16(s_256[2 ], s_256[3 ]);
do {
xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
xy_y_round_store_8x2_avx2(r, dst, dst_stride);
im += 2 * 8 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else {
do {
xy_y_convolve_6tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
xy_y_round_store_8x2_avx2(r, dst, dst_stride);
im += 2 * 8 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
}
} else if (w == 16 ) {
__m256i s_256[6 ];
s_256[0 ] = _mm256_loadu_si256((__m256i *)(im + 0 * 16 ));
s_256[1 ] = _mm256_loadu_si256((__m256i *)(im + 1 * 16 ));
s_256[2 ] = _mm256_loadu_si256((__m256i *)(im + 2 * 16 ));
s_256[3 ] = _mm256_loadu_si256((__m256i *)(im + 3 * 16 ));
s_256[4 ] = _mm256_loadu_si256((__m256i *)(im + 4 * 16 ));
y = h;
if (subpel_y_q4 != 8 ) {
__m256i ss_256[6 ], tt_256[6 ], r[4 ];
ss_256[0 ] = _mm256_unpacklo_epi16(s_256[0 ], s_256[1 ]);
ss_256[1 ] = _mm256_unpacklo_epi16(s_256[2 ], s_256[3 ]);
ss_256[3 ] = _mm256_unpackhi_epi16(s_256[0 ], s_256[1 ]);
ss_256[4 ] = _mm256_unpackhi_epi16(s_256[2 ], s_256[3 ]);
tt_256[0 ] = _mm256_unpacklo_epi16(s_256[1 ], s_256[2 ]);
tt_256[1 ] = _mm256_unpacklo_epi16(s_256[3 ], s_256[4 ]);
tt_256[3 ] = _mm256_unpackhi_epi16(s_256[1 ], s_256[2 ]);
tt_256[4 ] = _mm256_unpackhi_epi16(s_256[3 ], s_256[4 ]);
do {
xy_y_convolve_6tap_16x2_avx2(im, 16 , s_256, ss_256, tt_256,
coeffs_256, r);
xy_y_round_store_16x2_avx2(r, dst, dst_stride);
im += 2 * 16 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else {
__m256i ss_256[4 ], r[4 ];
do {
xy_y_convolve_6tap_16x2_half_pel_avx2(im, 16 , s_256, ss_256,
coeffs_256, r);
xy_y_round_store_16x2_avx2(r, dst, dst_stride);
im += 2 * 16 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
}
} else {
int32_t x = 0 ;
assert(!(w % 32 ));
__m256i s_256[2 ][6 ], ss_256[2 ][6 ], tt_256[2 ][6 ], r0[4 ], r1[4 ];
do {
const int16_t *s = im + x;
uint8_t *d = dst + x;
loadu_unpack_16bit_5rows_avx2(s, w, s_256[0 ], ss_256[0 ], tt_256[0 ]);
loadu_unpack_16bit_5rows_avx2(s + 16 , w, s_256[1 ], ss_256[1 ],
tt_256[1 ]);
y = h;
do {
xy_y_convolve_6tap_16x2_avx2(s, w, s_256[0 ], ss_256[0 ], tt_256[0 ],
coeffs_256, r0);
xy_y_convolve_6tap_16x2_avx2(s + 16 , w, s_256[1 ], ss_256[1 ],
tt_256[1 ], coeffs_256, r1);
xy_y_round_store_32_avx2(r0 + 0 , r1 + 0 , d);
xy_y_round_store_32_avx2(r0 + 2 , r1 + 2 , d + dst_stride);
s += 2 * w;
d += 2 * dst_stride;
y -= 2 ;
} while (y);
x += 32 ;
} while (x < w);
}
}
}
static void convolve_2d_sr_ver_8tap_avx2(
const int16_t *const im_block, const int32_t w, const int32_t h,
const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
uint8_t *dst, const int32_t dst_stride) {
const int16_t *im = im_block;
int32_t y;
if (w == 2 ) {
__m128i coeffs_128[4 ], s_32[8 ], ss_128[4 ];
prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
s_32[0 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2 ));
s_32[1 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2 ));
s_32[2 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2 ));
s_32[3 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2 ));
s_32[4 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2 ));
s_32[5 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2 ));
s_32[6 ] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2 ));
const __m128i src01 = _mm_unpacklo_epi32(s_32[0 ], s_32[1 ]);
const __m128i src12 = _mm_unpacklo_epi32(s_32[1 ], s_32[2 ]);
const __m128i src23 = _mm_unpacklo_epi32(s_32[2 ], s_32[3 ]);
const __m128i src34 = _mm_unpacklo_epi32(s_32[3 ], s_32[4 ]);
const __m128i src45 = _mm_unpacklo_epi32(s_32[4 ], s_32[5 ]);
const __m128i src56 = _mm_unpacklo_epi32(s_32[5 ], s_32[6 ]);
ss_128[0 ] = _mm_unpacklo_epi16(src01, src12);
ss_128[1 ] = _mm_unpacklo_epi16(src23, src34);
ss_128[2 ] = _mm_unpacklo_epi16(src45, src56);
y = h;
do {
const __m128i res =
xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
xy_y_round_store_2x2_sse2(res, dst, dst_stride);
im += 2 * 2 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else {
__m256i coeffs_256[4 ];
prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
if (w == 4 ) {
__m128i s_64[8 ];
__m256i s_256[8 ], ss_256[4 ];
s_64[0 ] = _mm_loadl_epi64((__m128i *)(im + 0 * 4 ));
s_64[1 ] = _mm_loadl_epi64((__m128i *)(im + 1 * 4 ));
s_64[2 ] = _mm_loadl_epi64((__m128i *)(im + 2 * 4 ));
s_64[3 ] = _mm_loadl_epi64((__m128i *)(im + 3 * 4 ));
s_64[4 ] = _mm_loadl_epi64((__m128i *)(im + 4 * 4 ));
s_64[5 ] = _mm_loadl_epi64((__m128i *)(im + 5 * 4 ));
s_64[6 ] = _mm_loadl_epi64((__m128i *)(im + 6 * 4 ));
// Load lines a and b. Line a to lower 128, line b to upper 128
s_256[0 ] = _mm256_setr_m128i(s_64[0 ], s_64[1 ]);
s_256[1 ] = _mm256_setr_m128i(s_64[1 ], s_64[2 ]);
s_256[2 ] = _mm256_setr_m128i(s_64[2 ], s_64[3 ]);
s_256[3 ] = _mm256_setr_m128i(s_64[3 ], s_64[4 ]);
s_256[4 ] = _mm256_setr_m128i(s_64[4 ], s_64[5 ]);
s_256[5 ] = _mm256_setr_m128i(s_64[5 ], s_64[6 ]);
ss_256[0 ] = _mm256_unpacklo_epi16(s_256[0 ], s_256[1 ]);
ss_256[1 ] = _mm256_unpacklo_epi16(s_256[2 ], s_256[3 ]);
ss_256[2 ] = _mm256_unpacklo_epi16(s_256[4 ], s_256[5 ]);
y = h;
do {
const __m256i res =
xy_y_convolve_8tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
xy_y_round_store_4x2_avx2(res, dst, dst_stride);
im += 2 * 4 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else if (w == 8 ) {
__m256i s_256[8 ], r[2 ];
s_256[0 ] = _mm256_loadu_si256((__m256i *)(im + 0 * 8 ));
s_256[1 ] = _mm256_loadu_si256((__m256i *)(im + 1 * 8 ));
s_256[2 ] = _mm256_loadu_si256((__m256i *)(im + 2 * 8 ));
s_256[3 ] = _mm256_loadu_si256((__m256i *)(im + 3 * 8 ));
s_256[4 ] = _mm256_loadu_si256((__m256i *)(im + 4 * 8 ));
s_256[5 ] = _mm256_loadu_si256((__m256i *)(im + 5 * 8 ));
y = h;
if (subpel_y_q4 != 8 ) {
__m256i ss_256[8 ];
convolve_8tap_unpack_avx2(s_256, ss_256);
do {
xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
xy_y_round_store_8x2_avx2(r, dst, dst_stride);
im += 2 * 8 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else {
do {
xy_y_convolve_8tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
xy_y_round_store_8x2_avx2(r, dst, dst_stride);
im += 2 * 8 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
}
} else if (w == 16 ) {
__m256i s_256[8 ], r[4 ];
load_16bit_7rows_avx2(im, 16 , s_256);
y = h;
if (subpel_y_q4 != 8 ) {
__m256i ss_256[8 ], tt_256[8 ];
convolve_8tap_unpack_avx2(s_256, ss_256);
convolve_8tap_unpack_avx2(s_256 + 1 , tt_256);
do {
xy_y_convolve_8tap_16x2_avx2(im, 16 , coeffs_256, s_256, ss_256,
tt_256, r);
xy_y_round_store_16x2_avx2(r, dst, dst_stride);
im += 2 * 16 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
} else {
do {
xy_y_convolve_8tap_16x2_half_pel_avx2(im, 16 , coeffs_256, s_256, r);
xy_y_round_store_16x2_avx2(r, dst, dst_stride);
im += 2 * 16 ;
dst += 2 * dst_stride;
y -= 2 ;
} while (y);
}
} else {
int32_t x = 0 ;
__m256i s_256[2 ][8 ], r0[4 ], r1[4 ];
assert(!(w % 32 ));
__m256i ss_256[2 ][8 ], tt_256[2 ][8 ];
do {
const int16_t *s = im + x;
uint8_t *d = dst + x;
load_16bit_7rows_avx2(s, w, s_256[0 ]);
convolve_8tap_unpack_avx2(s_256[0 ], ss_256[0 ]);
convolve_8tap_unpack_avx2(s_256[0 ] + 1 , tt_256[0 ]);
load_16bit_7rows_avx2(s + 16 , w, s_256[1 ]);
convolve_8tap_unpack_avx2(s_256[1 ], ss_256[1 ]);
convolve_8tap_unpack_avx2(s_256[1 ] + 1 , tt_256[1 ]);
y = h;
do {
xy_y_convolve_8tap_16x2_avx2(s, w, coeffs_256, s_256[0 ], ss_256[0 ],
tt_256[0 ], r0);
xy_y_convolve_8tap_16x2_avx2(s + 16 , w, coeffs_256, s_256[1 ],
ss_256[1 ], tt_256[1 ], r1);
xy_y_round_store_32_avx2(r0 + 0 , r1 + 0 , d);
xy_y_round_store_32_avx2(r0 + 2 , r1 + 2 , d + dst_stride);
s += 2 * w;
d += 2 * dst_stride;
y -= 2 ;
} while (y);
x += 32 ;
} while (x < w);
}
}
}
typedef void (*Convolve2dSrHorTapFunc)(
const uint8_t *const src, const int32_t src_stride, const int32_t w,
const int32_t h, const InterpFilterParams *const filter_params_x,
const int32_t subpel_x_q4, int16_t *const im_block);
typedef void (*Convolve2dSrVerTapFunc)(
const int16_t *const im_block, const int32_t w, const int32_t h,
const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
uint8_t *dst, const int32_t dst_stride);
static AOM_FORCE_INLINE void av1_convolve_2d_sr_specialized_avx2(
const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
const int32_t subpel_y_q4, ConvolveParams *conv_params) {
static const Convolve2dSrHorTapFunc
convolve_2d_sr_hor_tap_func_table[MAX_FILTER_TAP + 1 ] = {
NULL,
NULL,
convolve_2d_sr_hor_2tap_avx2,
NULL,
convolve_2d_sr_hor_4tap_ssse3,
NULL,
convolve_2d_sr_hor_6tap_avx2,
NULL,
convolve_2d_sr_hor_8tap_avx2
};
static const Convolve2dSrVerTapFunc
convolve_2d_sr_ver_tap_func_table[MAX_FILTER_TAP + 1 ] = {
NULL,
convolve_2d_sr_ver_2tap_half_avx2,
convolve_2d_sr_ver_2tap_avx2,
convolve_2d_sr_ver_4tap_avx2,
convolve_2d_sr_ver_4tap_avx2,
convolve_2d_sr_ver_6tap_avx2,
convolve_2d_sr_ver_6tap_avx2,
convolve_2d_sr_ver_8tap_avx2,
convolve_2d_sr_ver_8tap_avx2
};
const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4);
const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4);
assert(tap_x != 12 && tap_y != 12 );
const uint8_t *src_ptr = src - ((tap_y >> 1 ) - 1 ) * src_stride;
// Note: im_block is 8-pixel interlaced for width 32 and up, to avoid data
// permutation.
DECLARE_ALIGNED(32 , int16_t,
im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
(void )conv_params;
assert(conv_params->round_0 == 3 );
assert(conv_params->round_1 == 11 );
// horizontal filter
int32_t hh = h + tap_y;
assert(!(hh % 2 ));
convolve_2d_sr_hor_tap_func_table[tap_x](
src_ptr, src_stride, w, hh, filter_params_x, subpel_x_q4, im_block);
// vertical filter
convolve_2d_sr_ver_tap_func_table[tap_y - (subpel_y_q4 == 8 )](
im_block, w, h, filter_params_y, subpel_y_q4, dst, dst_stride);
}
#endif // THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_
Messung V0.5 in Prozent C=91 H=63 G=77
¤ Dauer der Verarbeitung: 0.24 Sekunden
(vorverarbeitet am 2026-06-07)
¤
*© Formatika GbR, Deutschland