/*
* Copyright ( c ) 2022 The WebM project authors . All Rights Reserved .
*
* Use of this source code is governed by a BSD - style license
* that can be found in the LICENSE file in the root of the source
* tree . An additional intellectual property rights grant can be found
* in the file PATENTS . All contributing project authors may
* be found in the AUTHORS file in the root of the source tree .
*/
#include <immintrin.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
// Horizontally adds the eight 32-bit lanes of sums_32 and returns the total.
static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) {
  // The byte shifts operate on each 128-bit half separately, so after the two
  // folds below element 0 of each half holds the sum of that half's lanes.
  const __m256i upper_pairs = _mm256_srli_si256(sums_32, 8);
  const __m256i pair_sums = _mm256_add_epi32(sums_32, upper_pairs);
  const __m256i odd_elems = _mm256_srli_si256(pair_sums, 4);
  const __m256i half_sums = _mm256_add_epi32(pair_sums, odd_elems);
  // Add the two halves together; only element 0 of the result is used.
  const __m128i lo_half = _mm256_castsi256_si128(half_sums);
  const __m128i hi_half = _mm256_extractf128_si256(half_sums, 1);
  const __m128i total = _mm_add_epi32(lo_half, hi_half);
  return (unsigned int)_mm_cvtsi128_si32(total);
}
// Adds, to the 16-bit lanes of *sums_16, the absolute differences between
// `height` rows of 64 samples read from src and ref.
//
//   sums_16   running per-lane sums, updated in place with wrapping 16-bit
//             adds; the caller must widen them before a lane can overflow.
//   src       source rows, read with aligned 256-bit loads.
//   ref       reference rows, read with unaligned 256-bit loads. It is only
//             read, so it is taken as a pointer to const.
//   *_stride  distance between consecutive rows, in 16-bit samples.
//   height    number of rows to process.
static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            const uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // One row is 64 samples, i.e. four 256-bit vectors of 16 samples each.
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
    // Absolute value of the 16-bit difference ref - src, per sample.
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2));
    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3));
    // Fold the four difference vectors into the running 16-bit sums.
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
    src += src_stride;
    ref += ref_stride;
  }
}
// Returns the SAD between two 64-sample-wide blocks of n rows each.
// src_ptr and ref_ptr are turned into 16-bit sample pointers with
// CONVERT_TO_SHORTPTR; the strides are in 16-bit samples.
static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr,
                                                         int src_stride,
                                                         const uint8_t *ref_ptr,
                                                         int ref_stride,
                                                         int n) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i acc_32 = _mm256_setzero_si256();
  int rows_left;

  // Process two rows per pass: the 16-bit partial sums are widened to 32 bits
  // after every pair of rows to keep them from going out of range.
  for (rows_left = n; rows_left >= 2; rows_left -= 2) {
    __m256i pair_16 = _mm256_setzero_si256();
    __m256i lo_32, hi_32;

    highbd_sad64xH(&pair_16, src_row, src_stride, ref_row, ref_stride, 2);

    // Zero-extend both 128-bit halves to 32-bit lanes and accumulate them.
    lo_32 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pair_16));
    hi_32 = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(pair_16, 1));
    acc_32 = _mm256_add_epi32(acc_32, _mm256_add_epi32(lo_32, hi_32));

    src_row += 2 * src_stride;
    ref_row += 2 * ref_stride;
  }
  return calc_final(acc_32);
}
// Defines vpx_highbd_sad64x<rows>_avx2(), which forwards to
// highbd_sad64xN_avx2() with the block height fixed to <rows>.
#define HIGHBD_SAD64XN(rows)                                              \
  unsigned int vpx_highbd_sad64x##rows##_avx2(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,             \
      int ref_stride) {                                                   \
    return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, (rows)); \
  }
// Defines vpx_highbd_sad_skip_64x<rows>_avx2(): the SAD is computed over
// every other row (doubled strides, half the row count) and then doubled.
#define HIGHBD_SADSKIP64xN(rows)                               \
  unsigned int vpx_highbd_sad_skip_64x##rows##_avx2(           \
      const uint8_t *src, int src_stride, const uint8_t *ref,  \
      int ref_stride) {                                        \
    const unsigned int half_sad = highbd_sad64xN_avx2(         \
        src, 2 * src_stride, ref, 2 * ref_stride, (rows) / 2); \
    return 2 * half_sad;                                       \
  }
// Adds, to the 16-bit lanes of *sums_16, the absolute differences between
// `height` rows of 32 samples read from src and ref.
//
//   sums_16   running per-lane sums, updated in place with wrapping 16-bit
//             adds; the caller must widen them before a lane can overflow.
//   src       source rows, read with aligned 256-bit loads.
//   ref       reference rows, read with unaligned 256-bit loads. It is only
//             read, so it is taken as a pointer to const.
//   *_stride  distance between consecutive rows, in 16-bit samples.
//   height    number of rows to process.
static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            const uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // One row is 32 samples, i.e. two 256-bit vectors of 16 samples each.
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    // Absolute value of the 16-bit difference ref - src, per sample.
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    // Fold both difference vectors into the running 16-bit sums.
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
    src += src_stride;
    ref += ref_stride;
  }
}
// Returns the SAD between two 32-sample-wide blocks of n rows each.
// src_ptr and ref_ptr are turned into 16-bit sample pointers with
// CONVERT_TO_SHORTPTR; the strides are in 16-bit samples.
static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr,
                                                         int src_stride,
                                                         const uint8_t *ref_ptr,
                                                         int ref_stride,
                                                         int n) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i acc_32 = _mm256_setzero_si256();
  int rows_left;

  // Process eight rows per pass: the 16-bit partial sums are widened to 32
  // bits after every group of eight rows to keep them from going out of range.
  for (rows_left = n; rows_left >= 8; rows_left -= 8) {
    __m256i group_16 = _mm256_setzero_si256();
    __m256i lo_32, hi_32;

    highbd_sad32xH(&group_16, src_row, src_stride, ref_row, ref_stride, 8);

    // Zero-extend both 128-bit halves to 32-bit lanes and accumulate them.
    lo_32 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(group_16));
    hi_32 = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(group_16, 1));
    acc_32 = _mm256_add_epi32(acc_32, _mm256_add_epi32(lo_32, hi_32));

    src_row += 8 * src_stride;
    ref_row += 8 * ref_stride;
  }
  return calc_final(acc_32);
}
// Defines vpx_highbd_sad32x<rows>_avx2(), which forwards to
// highbd_sad32xN_avx2() with the block height fixed to <rows>.
#define HIGHBD_SAD32XN(rows)                                              \
  unsigned int vpx_highbd_sad32x##rows##_avx2(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,             \
      int ref_stride) {                                                   \
    return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, (rows)); \
  }
// Defines vpx_highbd_sad_skip_32x<rows>_avx2(): the SAD is computed over
// every other row (doubled strides, half the row count) and then doubled.
#define HIGHBD_SADSKIP32xN(rows)                               \
  unsigned int vpx_highbd_sad_skip_32x##rows##_avx2(           \
      const uint8_t *src, int src_stride, const uint8_t *ref,  \
      int ref_stride) {                                        \
    const unsigned int half_sad = highbd_sad32xN_avx2(         \
        src, 2 * src_stride, ref, 2 * ref_stride, (rows) / 2); \
    return 2 * half_sad;                                       \
  }
// Adds, to the 16-bit lanes of *sums_16, the absolute differences between
// `height` rows of 16 samples read from src and ref.
//
//   sums_16   running per-lane sums, updated in place with wrapping 16-bit
//             adds; the caller must widen them before a lane can overflow.
//   src       source rows, read with aligned 256-bit loads (both src and
//             src + src_stride are loaded that way).
//   ref       reference rows, read with unaligned 256-bit loads. It is only
//             read, so it is taken as a pointer to const.
//   *_stride  distance between consecutive rows, in 16-bit samples.
//   height    number of rows to process; rows are consumed two at a time, so
//             an odd height would read one row more than requested.
static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            const uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; i += 2) {
    // A row is a single 256-bit vector, so each pass loads two rows.
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
    // Absolute value of the 16-bit difference ref - src, per sample.
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    // Fold both rows into the running 16-bit sums.
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
    src += src_stride << 1;
    ref += ref_stride << 1;
  }
}
// Returns the SAD between two 16-sample-wide blocks of n rows each.
// src_ptr and ref_ptr are turned into 16-bit sample pointers with
// CONVERT_TO_SHORTPTR; the strides are in 16-bit samples.
static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr,
                                                         int src_stride,
                                                         const uint8_t *ref_ptr,
                                                         int ref_stride,
                                                         int n) {
  const uint16_t *src_rows = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref_rows = CONVERT_TO_SHORTPTR(ref_ptr);
  // At most 16 rows are summed in 16 bits before widening to 32 bits.
  const int rows_per_pass = VPXMIN(16, n);
  const int passes = n / rows_per_pass;
  __m256i total_32 = _mm256_setzero_si256();
  int pass = 0;

  while (pass < passes) {
    __m256i partial_16 = _mm256_setzero_si256();
    __m256i lo_32, hi_32;

    highbd_sad16xH(&partial_16, src_rows, src_stride, ref_rows, ref_stride,
                   rows_per_pass);

    // Zero-extend both 128-bit halves to 32-bit lanes and accumulate them.
    lo_32 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(partial_16));
    hi_32 = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(partial_16, 1));
    total_32 = _mm256_add_epi32(total_32, _mm256_add_epi32(lo_32, hi_32));

    // The advance is a fixed 16 rows. When n < 16 there is only one pass, so
    // the advanced pointers are never used in that case.
    src_rows += 16 * src_stride;
    ref_rows += 16 * ref_stride;
    ++pass;
  }
  return calc_final(total_32);
}
// Defines vpx_highbd_sad16x<rows>_avx2(), which forwards to
// highbd_sad16xN_avx2() with the block height fixed to <rows>.
#define HIGHBD_SAD16XN(rows)                                              \
  unsigned int vpx_highbd_sad16x##rows##_avx2(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,             \
      int ref_stride) {                                                   \
    return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, (rows)); \
  }
// Defines vpx_highbd_sad_skip_16x<rows>_avx2(): the SAD is computed over
// every other row (doubled strides, half the row count) and then doubled.
#define HIGHBD_SADSKIP16xN(rows)                               \
  unsigned int vpx_highbd_sad_skip_16x##rows##_avx2(           \
      const uint8_t *src, int src_stride, const uint8_t *ref,  \
      int ref_stride) {                                        \
    const unsigned int half_sad = highbd_sad16xN_avx2(         \
        src, 2 * src_stride, ref, 2 * ref_stride, (rows) / 2); \
    return 2 * half_sad;                                       \
  }
// Returns the SAD between two 16x16 blocks of 16-bit samples. All 16 rows are
// summed in 16-bit lanes in one go and widened to 32 bits once at the end.
unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src_rows = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref_rows = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i partial_16 = _mm256_setzero_si256();
  __m256i lo_32, hi_32;

  highbd_sad16xH(&partial_16, src_rows, src_stride, ref_rows, ref_stride, 16);

  // Zero-extend both 128-bit halves to 32-bit lanes before the final sum.
  lo_32 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(partial_16));
  hi_32 = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(partial_16, 1));
  return calc_final(_mm256_add_epi32(lo_32, hi_32));
}
// Returns the SAD between two 16x8 blocks of 16-bit samples. All 8 rows are
// summed in 16-bit lanes in one go and widened to 32 bits once at the end.
unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src_rows = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref_rows = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i partial_16 = _mm256_setzero_si256();
  __m256i lo_32, hi_32;

  highbd_sad16xH(&partial_16, src_rows, src_stride, ref_rows, ref_stride, 8);

  // Zero-extend both 128-bit halves to 32-bit lanes before the final sum.
  lo_32 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(partial_16));
  hi_32 = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(partial_16, 1));
  return calc_final(_mm256_add_epi32(lo_32, hi_32));
}
// Instantiate the public SAD and skip-SAD entry points for each block size.
// The plain 16x16 and 16x8 SAD functions are written out by hand above, so
// only their skip variants are generated here.
// clang-format off
HIGHBD_SAD64XN(64 )
HIGHBD_SADSKIP64xN(64 )
HIGHBD_SAD64XN(32 )
HIGHBD_SADSKIP64xN(32 )
HIGHBD_SAD32XN(64 )
HIGHBD_SADSKIP32xN(64 )
HIGHBD_SAD32XN(32 )
HIGHBD_SADSKIP32xN(32 )
HIGHBD_SAD32XN(16 )
HIGHBD_SADSKIP32xN(16 )
HIGHBD_SAD16XN(32 )
HIGHBD_SADSKIP16xN(32 )
HIGHBD_SADSKIP16xN(16 )
HIGHBD_SADSKIP16xN(8 )
// clang-format on
// AVG -------------------------------------------------------------------------
// Adds, to the 16-bit lanes of *sums_16, the absolute differences between
// `height` rows of 64 source samples and the rounded average of ref and sec.
//
//   sums_16   running per-lane sums, updated in place with wrapping 16-bit
//             adds; the caller must widen them before a lane can overflow.
//   src       source rows, read with aligned 256-bit loads.
//   ref       reference rows, read with unaligned 256-bit loads.
//   sec       second predictor, read with unaligned loads as contiguous rows
//             of 64 samples (it advances by 64 per row, with no stride).
//   *_stride  distance between consecutive src / ref rows, in 16-bit samples.
//   height    number of rows to process.
//
// ref and sec are only read, so both are taken as pointers to const.
static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride,
                                                const uint16_t *ref,
                                                int ref_stride,
                                                const uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // One row is 64 samples, i.e. four 256-bit vectors of 16 samples each.
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32));
    const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48));
    // Unsigned rounded average of the reference and the second predictor.
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    const __m256i avg2 = _mm256_avg_epu16(r2, x2);
    const __m256i avg3 = _mm256_avg_epu16(r3, x3);
    // Absolute value of the 16-bit difference avg - src, per sample.
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2));
    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3));
    // Fold the four difference vectors into the running 16-bit sums.
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
    src += src_stride;
    ref += ref_stride;
    sec += 64;
  }
}
#define HIGHBD_SAD64XN_AVG(n) \
unsigned int vpx_highbd_sad64x## n## _avg_avx2( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred) { \
const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \
__m256i sums_32 = _mm256_setzero_si256(); \
int i; \
\
for (i = 0 ; i < (n / 2 ); ++i) { \
__m256i sums_16 = _mm256_setzero_si256(); \
\
highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2 ); \
\
/* sums_16 will outrange after 2 rows, so add current sums_16 to \
* sums_32*/ \
sums_32 = _mm256_add_epi32( \
sums_32, \
_mm256_add_epi32( \
_mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
_mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1 )))); \
\
src += src_stride << 1 ; \
ref += ref_stride << 1 ; \
sec += 64 << 1 ; \
} \
return calc_final(sums_32); \
}
// Instantiate vpx_highbd_sad64x64_avg_avx2().
HIGHBD_SAD64XN_AVG(64 )
// Instantiate vpx_highbd_sad64x32_avg_avx2().
HIGHBD_SAD64XN_AVG(32 )
// Adds, to the 16-bit lanes of *sums_16, the absolute differences between
// `height` rows of 32 source samples and the rounded average of ref and sec.
//
//   sums_16   running per-lane sums, updated in place with wrapping 16-bit
//             adds; the caller must widen them before a lane can overflow.
//   src       source rows, read with aligned 256-bit loads.
//   ref       reference rows, read with unaligned 256-bit loads.
//   sec       second predictor, read with unaligned loads as contiguous rows
//             of 32 samples (it advances by 32 per row, with no stride).
//   *_stride  distance between consecutive src / ref rows, in 16-bit samples.
//   height    number of rows to process.
//
// ref and sec are only read, so both are taken as pointers to const.
static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride,
                                                const uint16_t *ref,
                                                int ref_stride,
                                                const uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // One row is 32 samples, i.e. two 256-bit vectors of 16 samples each.
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    // Unsigned rounded average of the reference and the second predictor.
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    // Absolute value of the 16-bit difference avg - src, per sample.
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    // Fold both difference vectors into the running 16-bit sums.
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
    src += src_stride;
    ref += ref_stride;
    sec += 32;
  }
}
#define HIGHBD_SAD32XN_AVG(n) \
unsigned int vpx_highbd_sad32x## n## _avg_avx2( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred) { \
const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \
__m256i sums_32 = _mm256_setzero_si256(); \
int i; \
\
for (i = 0 ; i < (n / 8 ); ++i) { \
__m256i sums_16 = _mm256_setzero_si256(); \
\
highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8 ); \
\
/* sums_16 will outrange after 8 rows, so add current sums_16 to \
* sums_32*/ \
sums_32 = _mm256_add_epi32( \
sums_32, \
_mm256_add_epi32( \
_mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
_mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1 )))); \
\
src += src_stride << 3 ; \
ref += ref_stride << 3 ; \
sec += 32 << 3 ; \
} \
return calc_final(sums_32); \
}
// Instantiate vpx_highbd_sad32x64_avg_avx2().
HIGHBD_SAD32XN_AVG(64 )
// Instantiate vpx_highbd_sad32x32_avg_avx2().
HIGHBD_SAD32XN_AVG(32 )
// Instantiate vpx_highbd_sad32x16_avg_avx2().
HIGHBD_SAD32XN_AVG(16 )
// Adds, to the 16-bit lanes of *sums_16, the absolute differences between
// `height` rows of 16 source samples and the rounded average of ref and sec.
//
//   sums_16   running per-lane sums, updated in place with wrapping 16-bit
//             adds; the caller must widen them before a lane can overflow.
//   src       source rows, read with aligned 256-bit loads (both src and
//             src + src_stride are loaded that way).
//   ref       reference rows, read with unaligned 256-bit loads.
//   sec       second predictor, read with unaligned loads as contiguous rows
//             of 16 samples (it advances by 32 per pair of rows).
//   *_stride  distance between consecutive src / ref rows, in 16-bit samples.
//   height    number of rows to process; rows are consumed two at a time, so
//             an odd height would read one row more than requested.
//
// ref and sec are only read, so both are taken as pointers to const.
static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride,
                                                const uint16_t *ref,
                                                int ref_stride,
                                                const uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; i += 2) {
    // A row is a single 256-bit vector, so each pass loads two rows.
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    // Unsigned rounded average of the reference and the second predictor.
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    // Absolute value of the 16-bit difference avg - src, per sample.
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    // Fold both rows into the running 16-bit sums.
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
    src += src_stride << 1;
    ref += ref_stride << 1;
    sec += 32;
  }
}
// Returns the SAD between a 16x32 source block and the average of the
// reference block and second_pred. The 32 rows are handled as two groups of
// 16 so the 16-bit partial sums can be widened to 32 bits in between.
unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src_rows = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref_rows = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec_rows = CONVERT_TO_SHORTPTR(second_pred);
  __m256i total_32 = _mm256_setzero_si256();
  int group;

  for (group = 0; group < 2; ++group) {
    __m256i partial_16 = _mm256_setzero_si256();
    __m256i lo_32, hi_32;

    highbd_sad16xH_avg(&partial_16, src_rows, src_stride, ref_rows, ref_stride,
                       sec_rows, 16);

    // Zero-extend both 128-bit halves to 32-bit lanes and accumulate them.
    lo_32 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(partial_16));
    hi_32 = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(partial_16, 1));
    total_32 = _mm256_add_epi32(total_32, _mm256_add_epi32(lo_32, hi_32));

    src_rows += 16 * src_stride;
    ref_rows += 16 * ref_stride;
    // second_pred advances by 16 contiguous rows of 16 samples.
    sec_rows += 16 * 16;
  }
  return calc_final(total_32);
}
// Returns the SAD between a 16x16 source block and the average of the
// reference block and second_pred. All 16 rows are summed in 16-bit lanes in
// one go and widened to 32 bits once at the end.
unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src_rows = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref_rows = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec_rows = CONVERT_TO_SHORTPTR(second_pred);
  __m256i partial_16 = _mm256_setzero_si256();
  __m256i lo_32, hi_32;

  highbd_sad16xH_avg(&partial_16, src_rows, src_stride, ref_rows, ref_stride,
                     sec_rows, 16);

  // Zero-extend both 128-bit halves to 32-bit lanes before the final sum.
  lo_32 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(partial_16));
  hi_32 = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(partial_16, 1));
  return calc_final(_mm256_add_epi32(lo_32, hi_32));
}
// Returns the SAD between a 16x8 source block and the average of the
// reference block and second_pred. All 8 rows are summed in 16-bit lanes in
// one go and widened to 32 bits once at the end.
unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         const uint8_t *second_pred) {
  const uint16_t *src_rows = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref_rows = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec_rows = CONVERT_TO_SHORTPTR(second_pred);
  __m256i partial_16 = _mm256_setzero_si256();
  __m256i lo_32, hi_32;

  highbd_sad16xH_avg(&partial_16, src_rows, src_stride, ref_rows, ref_stride,
                     sec_rows, 8);

  // Zero-extend both 128-bit halves to 32-bit lanes before the final sum.
  lo_32 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(partial_16));
  hi_32 = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(partial_16, 1));
  return calc_final(_mm256_add_epi32(lo_32, hi_32));
}
/*
 * Measurement V0.5 in percent: C=94 H=93 G=93
 * Processing time: 0.10 seconds (preprocessed on 2026-06-11)
 * © Formatika GbR, Germany
 */