/*
* Copyright (c) 2023, Alliance for Open Media. All rights reserved.
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "av1/encoder/arm/pickrst_neon.h"
#include "av1/encoder/pickrst.h"
static inline void highbd_calc_proj_params_r0_r1_neon(
const uint8_t *src8, int width, int height, int src_stride,
const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
int32_t *flt1, int flt1_stride, int64_t H[2 ][2 ], int64_t C[2 ]) {
assert(width % 8 == 0 );
const int size = width * height;
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
int64x2_t h00_lo = vdupq_n_s64(0 );
int64x2_t h00_hi = vdupq_n_s64(0 );
int64x2_t h11_lo = vdupq_n_s64(0 );
int64x2_t h11_hi = vdupq_n_s64(0 );
int64x2_t h01_lo = vdupq_n_s64(0 );
int64x2_t h01_hi = vdupq_n_s64(0 );
int64x2_t c0_lo = vdupq_n_s64(0 );
int64x2_t c0_hi = vdupq_n_s64(0 );
int64x2_t c1_lo = vdupq_n_s64(0 );
int64x2_t c1_hi = vdupq_n_s64(0 );
do {
const uint16_t *src_ptr = src;
const uint16_t *dat_ptr = dat;
int32_t *flt0_ptr = flt0;
int32_t *flt1_ptr = flt1;
int w = width;
do {
uint16x8_t s = vld1q_u16(src_ptr);
uint16x8_t d = vld1q_u16(dat_ptr);
int32x4_t f0_lo = vld1q_s32(flt0_ptr);
int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4 );
int32x4_t f1_lo = vld1q_s32(flt1_ptr);
int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4 );
int32x4_t u_lo =
vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
int32x4_t u_hi = vreinterpretq_s32_u32(
vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
int32x4_t s_lo =
vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
int32x4_t s_hi = vreinterpretq_s32_u32(
vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
s_lo = vsubq_s32(s_lo, u_lo);
s_hi = vsubq_s32(s_hi, u_hi);
f0_lo = vsubq_s32(f0_lo, u_lo);
f0_hi = vsubq_s32(f0_hi, u_hi);
f1_lo = vsubq_s32(f1_lo, u_lo);
f1_hi = vsubq_s32(f1_hi, u_hi);
h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo));
h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo));
h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi));
h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi));
c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
src_ptr += 8 ;
dat_ptr += 8 ;
flt0_ptr += 8 ;
flt1_ptr += 8 ;
w -= 8 ;
} while (w != 0 );
src += src_stride;
dat += dat_stride;
flt0 += flt0_stride;
flt1 += flt1_stride;
} while (--height != 0 );
H[0 ][0 ] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
H[0 ][1 ] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size;
H[1 ][1 ] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
H[1 ][0 ] = H[0 ][1 ];
C[0 ] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
C[1 ] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
}
static inline void highbd_calc_proj_params_r0_neon(
const uint8_t *src8, int width, int height, int src_stride,
const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
int64_t H[2 ][2 ], int64_t C[2 ]) {
assert(width % 8 == 0 );
const int size = width * height;
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
int64x2_t h00_lo = vdupq_n_s64(0 );
int64x2_t h00_hi = vdupq_n_s64(0 );
int64x2_t c0_lo = vdupq_n_s64(0 );
int64x2_t c0_hi = vdupq_n_s64(0 );
do {
const uint16_t *src_ptr = src;
const uint16_t *dat_ptr = dat;
int32_t *flt0_ptr = flt0;
int w = width;
do {
uint16x8_t s = vld1q_u16(src_ptr);
uint16x8_t d = vld1q_u16(dat_ptr);
int32x4_t f0_lo = vld1q_s32(flt0_ptr);
int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4 );
int32x4_t u_lo =
vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
int32x4_t u_hi = vreinterpretq_s32_u32(
vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
int32x4_t s_lo =
vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
int32x4_t s_hi = vreinterpretq_s32_u32(
vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
s_lo = vsubq_s32(s_lo, u_lo);
s_hi = vsubq_s32(s_hi, u_hi);
f0_lo = vsubq_s32(f0_lo, u_lo);
f0_hi = vsubq_s32(f0_hi, u_hi);
h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
src_ptr += 8 ;
dat_ptr += 8 ;
flt0_ptr += 8 ;
w -= 8 ;
} while (w != 0 );
src += src_stride;
dat += dat_stride;
flt0 += flt0_stride;
} while (--height != 0 );
H[0 ][0 ] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
C[0 ] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
}
static inline void highbd_calc_proj_params_r1_neon(
const uint8_t *src8, int width, int height, int src_stride,
const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
int64_t H[2 ][2 ], int64_t C[2 ]) {
assert(width % 8 == 0 );
const int size = width * height;
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
int64x2_t h11_lo = vdupq_n_s64(0 );
int64x2_t h11_hi = vdupq_n_s64(0 );
int64x2_t c1_lo = vdupq_n_s64(0 );
int64x2_t c1_hi = vdupq_n_s64(0 );
do {
const uint16_t *src_ptr = src;
const uint16_t *dat_ptr = dat;
int32_t *flt1_ptr = flt1;
int w = width;
do {
uint16x8_t s = vld1q_u16(src_ptr);
uint16x8_t d = vld1q_u16(dat_ptr);
int32x4_t f1_lo = vld1q_s32(flt1_ptr);
int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4 );
int32x4_t u_lo =
vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
int32x4_t u_hi = vreinterpretq_s32_u32(
vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
int32x4_t s_lo =
vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
int32x4_t s_hi = vreinterpretq_s32_u32(
vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
s_lo = vsubq_s32(s_lo, u_lo);
s_hi = vsubq_s32(s_hi, u_hi);
f1_lo = vsubq_s32(f1_lo, u_lo);
f1_hi = vsubq_s32(f1_hi, u_hi);
h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
src_ptr += 8 ;
dat_ptr += 8 ;
flt1_ptr += 8 ;
w -= 8 ;
} while (w != 0 );
src += src_stride;
dat += dat_stride;
flt1 += flt1_stride;
} while (--height != 0 );
H[1 ][1 ] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
C[1 ] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
}
// The function calls 3 subfunctions for the following cases :
// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
// of C and H need to be computed.
// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
// non-zero and need to be computed.
// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
// non-zero and need to be computed.
void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width,
int height, int src_stride,
const uint8_t *dat8, int dat_stride,
int32_t *flt0, int flt0_stride,
int32_t *flt1, int flt1_stride,
int64_t H[2 ][2 ], int64_t C[2 ],
const sgr_params_type *params) {
if ((params->r[0 ] > 0 ) && (params->r[1 ] > 0 )) {
highbd_calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
dat_stride, flt0, flt0_stride, flt1,
flt1_stride, H, C);
} else if (params->r[0 ] > 0 ) {
highbd_calc_proj_params_r0_neon(src8, width, height, src_stride, dat8,
dat_stride, flt0, flt0_stride, H, C);
} else if (params->r[1 ] > 0 ) {
highbd_calc_proj_params_r1_neon(src8, width, height, src_stride, dat8,
dat_stride, flt1, flt1_stride, H, C);
}
}
static inline void hadd_update_4_stats_neon(const int64_t *const src,
const int32x4_t *deltas,
int64_t *const dst) {
int64x2_t delta0_s64 = vpaddlq_s32(deltas[0 ]);
int64x2_t delta1_s64 = vpaddlq_s32(deltas[1 ]);
int64x2_t delta2_s64 = vpaddlq_s32(deltas[2 ]);
int64x2_t delta3_s64 = vpaddlq_s32(deltas[3 ]);
#if AOM_ARCH_AARCH64
int64x2_t delta01 = vpaddq_s64(delta0_s64, delta1_s64);
int64x2_t delta23 = vpaddq_s64(delta2_s64, delta3_s64);
int64x2_t src0 = vld1q_s64(src);
int64x2_t src1 = vld1q_s64(src + 2 );
vst1q_s64(dst, vaddq_s64(src0, delta01));
vst1q_s64(dst + 2 , vaddq_s64(src1, delta23));
#else
dst[0 ] = src[0 ] + horizontal_add_s64x2(delta0_s64);
dst[1 ] = src[1 ] + horizontal_add_s64x2(delta1_s64);
dst[2 ] = src[2 ] + horizontal_add_s64x2(delta2_s64);
dst[3 ] = src[3 ] + horizontal_add_s64x2(delta3_s64);
#endif
}
static inline void compute_stats_win5_highbd_neon(
const int16_t *const d, const int32_t d_stride, const int16_t *const s,
const int32_t s_stride, const int32_t width, const int32_t height,
int64_t *const M, int64_t *const H, aom_bit_depth_t bit_depth) {
const int32_t wiener_win = WIENER_WIN_CHROMA;
const int32_t wiener_win2 = wiener_win * wiener_win;
const int32_t w16 = width & ~15 ;
const int32_t h8 = height & ~7 ;
int16x8_t mask[2 ];
mask[0 ] = vld1q_s16(&(mask_16bit[16 ]) - width % 16 );
mask[1 ] = vld1q_s16(&(mask_16bit[16 ]) - width % 16 + 8 );
int32_t i, j, x, y;
const int32_t num_bit_left =
32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 2 /* SIMD */;
const int32_t h_allowed =
(1 << num_bit_left) / (w16 + ((w16 != width) ? 16 : 0 ));
// Step 1: Calculate the top edge of the whole matrix, i.e., the top
// edge of each triangle and square on the top row.
j = 0 ;
do {
const int16_t *s_t = s;
const int16_t *d_t = d;
int32_t height_t = 0 ;
int64x2_t sum_m[WIENER_WIN_CHROMA] = { vdupq_n_s64(0 ) };
int64x2_t sum_h[WIENER_WIN_CHROMA] = { vdupq_n_s64(0 ) };
int16x8_t src[2 ], dgd[2 ];
do {
const int32_t h_t =
((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
int32x4_t row_m[WIENER_WIN_CHROMA] = { vdupq_n_s32(0 ) };
int32x4_t row_h[WIENER_WIN_CHROMA] = { vdupq_n_s32(0 ) };
y = h_t;
do {
x = 0 ;
while (x < w16) {
src[0 ] = vld1q_s16(s_t + x + 0 );
src[1 ] = vld1q_s16(s_t + x + 8 );
dgd[0 ] = vld1q_s16(d_t + x + 0 );
dgd[1 ] = vld1q_s16(d_t + x + 8 );
stats_top_win5_neon(src, dgd, d_t + j + x, d_stride, row_m, row_h);
x += 16 ;
}
if (w16 != width) {
src[0 ] = vld1q_s16(s_t + w16 + 0 );
src[1 ] = vld1q_s16(s_t + w16 + 8 );
dgd[0 ] = vld1q_s16(d_t + w16 + 0 );
dgd[1 ] = vld1q_s16(d_t + w16 + 8 );
src[0 ] = vandq_s16(src[0 ], mask[0 ]);
src[1 ] = vandq_s16(src[1 ], mask[1 ]);
dgd[0 ] = vandq_s16(dgd[0 ], mask[0 ]);
dgd[1 ] = vandq_s16(dgd[1 ], mask[1 ]);
stats_top_win5_neon(src, dgd, d_t + j + w16, d_stride, row_m, row_h);
}
s_t += s_stride;
d_t += d_stride;
} while (--y);
sum_m[0 ] = vpadalq_s32(sum_m[0 ], row_m[0 ]);
sum_m[1 ] = vpadalq_s32(sum_m[1 ], row_m[1 ]);
sum_m[2 ] = vpadalq_s32(sum_m[2 ], row_m[2 ]);
sum_m[3 ] = vpadalq_s32(sum_m[3 ], row_m[3 ]);
sum_m[4 ] = vpadalq_s32(sum_m[4 ], row_m[4 ]);
sum_h[0 ] = vpadalq_s32(sum_h[0 ], row_h[0 ]);
sum_h[1 ] = vpadalq_s32(sum_h[1 ], row_h[1 ]);
sum_h[2 ] = vpadalq_s32(sum_h[2 ], row_h[2 ]);
sum_h[3 ] = vpadalq_s32(sum_h[3 ], row_h[3 ]);
sum_h[4 ] = vpadalq_s32(sum_h[4 ], row_h[4 ]);
height_t += h_t;
} while (height_t < height);
#if AOM_ARCH_AARCH64
int64x2_t sum_m0 = vpaddq_s64(sum_m[0 ], sum_m[1 ]);
int64x2_t sum_m2 = vpaddq_s64(sum_m[2 ], sum_m[3 ]);
vst1q_s64(&M[wiener_win * j + 0 ], sum_m0);
vst1q_s64(&M[wiener_win * j + 2 ], sum_m2);
M[wiener_win * j + 4 ] = vaddvq_s64(sum_m[4 ]);
int64x2_t sum_h0 = vpaddq_s64(sum_h[0 ], sum_h[1 ]);
int64x2_t sum_h2 = vpaddq_s64(sum_h[2 ], sum_h[3 ]);
vst1q_s64(&H[wiener_win * j + 0 ], sum_h0);
vst1q_s64(&H[wiener_win * j + 2 ], sum_h2);
H[wiener_win * j + 4 ] = vaddvq_s64(sum_h[4 ]);
#else
M[wiener_win * j + 0 ] = horizontal_add_s64x2(sum_m[0 ]);
M[wiener_win * j + 1 ] = horizontal_add_s64x2(sum_m[1 ]);
M[wiener_win * j + 2 ] = horizontal_add_s64x2(sum_m[2 ]);
M[wiener_win * j + 3 ] = horizontal_add_s64x2(sum_m[3 ]);
M[wiener_win * j + 4 ] = horizontal_add_s64x2(sum_m[4 ]);
H[wiener_win * j + 0 ] = horizontal_add_s64x2(sum_h[0 ]);
H[wiener_win * j + 1 ] = horizontal_add_s64x2(sum_h[1 ]);
H[wiener_win * j + 2 ] = horizontal_add_s64x2(sum_h[2 ]);
H[wiener_win * j + 3 ] = horizontal_add_s64x2(sum_h[3 ]);
H[wiener_win * j + 4 ] = horizontal_add_s64x2(sum_h[4 ]);
#endif // AOM_ARCH_AARCH64
} while (++j < wiener_win);
// Step 2: Calculate the left edge of each square on the top row.
j = 1 ;
do {
const int16_t *d_t = d;
int32_t height_t = 0 ;
int64x2_t sum_h[WIENER_WIN_CHROMA - 1 ] = { vdupq_n_s64(0 ) };
int16x8_t dgd[2 ];
do {
const int32_t h_t =
((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
int32x4_t row_h[WIENER_WIN_CHROMA - 1 ] = { vdupq_n_s32(0 ) };
y = h_t;
do {
x = 0 ;
while (x < w16) {
dgd[0 ] = vld1q_s16(d_t + j + x + 0 );
dgd[1 ] = vld1q_s16(d_t + j + x + 8 );
stats_left_win5_neon(dgd, d_t + x, d_stride, row_h);
x += 16 ;
}
if (w16 != width) {
dgd[0 ] = vld1q_s16(d_t + j + x + 0 );
dgd[1 ] = vld1q_s16(d_t + j + x + 8 );
dgd[0 ] = vandq_s16(dgd[0 ], mask[0 ]);
dgd[1 ] = vandq_s16(dgd[1 ], mask[1 ]);
stats_left_win5_neon(dgd, d_t + x, d_stride, row_h);
}
d_t += d_stride;
} while (--y);
sum_h[0 ] = vpadalq_s32(sum_h[0 ], row_h[0 ]);
sum_h[1 ] = vpadalq_s32(sum_h[1 ], row_h[1 ]);
sum_h[2 ] = vpadalq_s32(sum_h[2 ], row_h[2 ]);
sum_h[3 ] = vpadalq_s32(sum_h[3 ], row_h[3 ]);
height_t += h_t;
} while (height_t < height);
#if AOM_ARCH_AARCH64
int64x2_t sum_h0 = vpaddq_s64(sum_h[0 ], sum_h[1 ]);
int64x2_t sum_h1 = vpaddq_s64(sum_h[2 ], sum_h[3 ]);
vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h0));
vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h0));
vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h1));
vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h1));
#else
H[1 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[0 ]);
H[2 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[1 ]);
H[3 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[2 ]);
H[4 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[3 ]);
#endif // AOM_ARCH_AARCH64
} while (++j < wiener_win);
// Step 3: Derive the top edge of each triangle along the diagonal. No
// triangle in top row.
{
const int16_t *d_t = d;
if (height % 2 ) {
int32x4_t deltas[(WIENER_WIN + 1 ) * 2 ] = { vdupq_n_s32(0 ) };
int32x4_t deltas_tr[(WIENER_WIN + 1 ) * 2 ] = { vdupq_n_s32(0 ) };
int16x8_t ds[WIENER_WIN * 2 ];
load_s16_8x4(d_t, d_stride, &ds[0 ], &ds[>2 ], &ds[4 ], &ds[6 ]);
load_s16_8x4(d_t + width, d_stride, &ds[1 ], &ds[3 ], &ds[5 ], &ds[7 ]);
d_t += 4 * d_stride;
step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas);
transpose_arrays_s32_8x8(deltas, deltas_tr);
update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
deltas_tr[0 ], vgetq_lane_s32(deltas_tr[4 ], 0 ),
H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
deltas_tr[1 ], vgetq_lane_s32(deltas_tr[5 ], 0 ),
H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
deltas_tr[2 ], vgetq_lane_s32(deltas_tr[6 ], 0 ),
H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
deltas_tr[3 ], vgetq_lane_s32(deltas_tr[7 ], 0 ),
H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
} else {
int32x4_t deltas[WIENER_WIN_CHROMA * 2 ] = { vdupq_n_s32(0 ) };
int16x8_t ds[WIENER_WIN_CHROMA * 2 ];
ds[0 ] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
ds[1 ] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
ds[2 ] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
ds[3 ] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
transpose_elems_inplace_s32_4x4(&deltas[0 ], &deltas[1 ], &deltas[2 ],
&deltas[3 ]);
update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
deltas[0 ], vgetq_lane_s32(deltas[4 ], 0 ),
H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
deltas[1 ], vgetq_lane_s32(deltas[4 ], 1 ),
H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
deltas[2 ], vgetq_lane_s32(deltas[4 ], 2 ),
H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
deltas[3 ], vgetq_lane_s32(deltas[4 ], 3 ),
H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
}
}
// Step 4: Derive the top and left edge of each square. No square in top and
// bottom row.
{
y = h8;
int16x4_t d_s[12 ];
int16x4_t d_e[12 ];
const int16_t *d_t = d;
int16x4_t zeros = vdup_n_s16(0 );
load_s16_4x4(d_t, d_stride, &d_s[0 ], &d_s[>1 ], &d_s[2 ], &d_s[3 ]);
load_s16_4x4(d_t + width, d_stride, &d_e[0 ], &d_e[1 ], &d_e[2 ], &d_e[3 ]);
int32x4_t deltas[6 ][18 ] = { { vdupq_n_s32(0 ) }, { vdupq_n_s32(0 ) } };
while (y >= 8 ) {
load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4 ], &d_s[5 ], &d_s[6 ],
&d_s[7 ], &d_s[8 ], &d_s[9 ], &d_s[10 ], &d_s[11 ]);
load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4 ], &d_e[le='color: green'>5 ],
&d_e[6 ], &d_e[7 ], &d_e[8 ], &d_e[9 ], &d_e[10 ], &d_e[11 ]);
int16x8_t s_tr[8 ], e_tr[8 ];
transpose_elems_s16_4x8(d_s[0 ], d_s[1 ], d_s[2 ], d_s[3 ], d_s[4 ], d_s[5 ],
d_s[6 ], d_s[7 ], &s_tr[0 ], &s_tr[1 ], &s_tr[le='color: green'>2 ],
&s_tr[3 ]);
transpose_elems_s16_4x8(d_s[8 ], d_s[9 ], d_s[10 ], d_s[11 ], zeros, zeros,
zeros, zeros, &s_tr[4 ], &s_tr[5 ], &s_tr[='color: green'>6 ],
&s_tr[7 ]);
transpose_elems_s16_4x8(d_e[0 ], d_e[1 ], d_e[2 ], d_e[3 ], d_e[4 ], d_e[5 ],
d_e[6 ], d_e[7 ], &e_tr[0 ], &e_tr[1 ], &e_tr[le='color: green'>2 ],
&e_tr[3 ]);
transpose_elems_s16_4x8(d_e[8 ], d_e[9 ], d_e[10 ], d_e[11 ], zeros, zeros,
zeros, zeros, &e_tr[4 ], &e_tr[5 ], &e_tr[='color: green'>6 ],
&e_tr[7 ]);
int16x8_t start_col0[5 ], start_col1[5 ], start_col2[5 ], start_col3[5 ];
start_col0[0 ] = s_tr[0 ];
start_col0[1 ] = vextq_s16(s_tr[0 ], s_tr[4 ], 1 );
start_col0[2 ] = vextq_s16(s_tr[0 ], s_tr[4 ], 2 );
start_col0[3 ] = vextq_s16(s_tr[0 ], s_tr[4 ], 3 );
start_col0[4 ] = vextq_s16(s_tr[0 ], s_tr[4 ], 4 );
start_col1[0 ] = s_tr[1 ];
start_col1[1 ] = vextq_s16(s_tr[1 ], s_tr[5 ], 1 );
start_col1[2 ] = vextq_s16(s_tr[1 ], s_tr[5 ], 2 );
start_col1[3 ] = vextq_s16(s_tr[1 ], s_tr[5 ], 3 );
start_col1[4 ] = vextq_s16(s_tr[1 ], s_tr[5 ], 4 );
start_col2[0 ] = s_tr[2 ];
start_col2[1 ] = vextq_s16(s_tr[2 ], s_tr[6 ], 1 );
start_col2[2 ] = vextq_s16(s_tr[2 ], s_tr[6 ], 2 );
start_col2[3 ] = vextq_s16(s_tr[2 ], s_tr[6 ], 3 );
start_col2[4 ] = vextq_s16(s_tr[2 ], s_tr[6 ], 4 );
start_col3[0 ] = s_tr[3 ];
start_col3[1 ] = vextq_s16(s_tr[3 ], s_tr[7 ], 1 );
start_col3[2 ] = vextq_s16(s_tr[3 ], s_tr[7 ], 2 );
start_col3[3 ] = vextq_s16(s_tr[3 ], s_tr[7 ], 3 );
start_col3[4 ] = vextq_s16(s_tr[3 ], s_tr[7 ], 4 );
// i = 1, j = 2;
sub_deltas_step4(start_col0, start_col1, deltas[0 ]);
// i = 1, j = 3;
sub_deltas_step4(start_col0, start_col2, deltas[1 ]);
// i = 1, j = 4
sub_deltas_step4(start_col0, start_col3, deltas[2 ]);
// i = 2, j =3
sub_deltas_step4(start_col1, start_col2, deltas[3 ]);
// i = 2, j = 4
sub_deltas_step4(start_col1, start_col3, deltas[4 ]);
// i = 3, j = 4
sub_deltas_step4(start_col2, start_col3, deltas[5 ]);
int16x8_t end_col0[5 ], end_col1[5 ], end_col2[5 ], end_col3[5 ];
end_col0[0 ] = e_tr[0 ];
end_col0[1 ] = vextq_s16(e_tr[0 ], e_tr[4 ], 1 );
end_col0[2 ] = vextq_s16(e_tr[0 ], e_tr[4 ], 2 );
end_col0[3 ] = vextq_s16(e_tr[0 ], e_tr[4 ], 3 );
end_col0[4 ] = vextq_s16(e_tr[0 ], e_tr[4 ], 4 );
end_col1[0 ] = e_tr[1 ];
end_col1[1 ] = vextq_s16(e_tr[1 ], e_tr[5 ], 1 );
end_col1[2 ] = vextq_s16(e_tr[1 ], e_tr[5 ], 2 );
end_col1[3 ] = vextq_s16(e_tr[1 ], e_tr[5 ], 3 );
end_col1[4 ] = vextq_s16(e_tr[1 ], e_tr[5 ], 4 );
end_col2[0 ] = e_tr[2 ];
end_col2[1 ] = vextq_s16(e_tr[2 ], e_tr[6 ], 1 );
end_col2[2 ] = vextq_s16(e_tr[2 ], e_tr[6 ], 2 );
end_col2[3 ] = vextq_s16(e_tr[2 ], e_tr[6 ], 3 );
end_col2[4 ] = vextq_s16(e_tr[2 ], e_tr[6 ], 4 );
end_col3[0 ] = e_tr[3 ];
end_col3[1 ] = vextq_s16(e_tr[3 ], e_tr[7 ], 1 );
end_col3[2 ] = vextq_s16(e_tr[3 ], e_tr[7 ], 2 );
end_col3[3 ] = vextq_s16(e_tr[3 ], e_tr[7 ], 3 );
end_col3[4 ] = vextq_s16(e_tr[3 ], e_tr[7 ], 4 );
// i = 1, j = 2;
add_deltas_step4(end_col0, end_col1, deltas[0 ]);
// i = 1, j = 3;
add_deltas_step4(end_col0, end_col2, deltas[1 ]);
// i = 1, j = 4
add_deltas_step4(end_col0, end_col3, deltas[2 ]);
// i = 2, j =3
add_deltas_step4(end_col1, end_col2, deltas[3 ]);
// i = 2, j = 4
add_deltas_step4(end_col1, end_col3, deltas[4 ]);
// i = 3, j = 4
add_deltas_step4(end_col2, end_col3, deltas[5 ]);
d_s[0 ] = d_s[8 ];
d_s[1 ] = d_s[9 ];
d_s[2 ] = d_s[10 ];
d_s[3 ] = d_s[11 ];
d_e[0 ] = d_e[8 ];
d_e[1 ] = d_e[9 ];
d_e[2 ] = d_e[10 ];
d_e[3 ] = d_e[11 ];
d_t += 8 * d_stride;
y -= 8 ;
}
if (h8 != height) {
const int16x8_t mask_h = vld1q_s16(&mask_16bit[16 ] - (height % 8 ));
load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4 ], &d_s[5 ], &d_s[6 ],
&d_s[7 ], &d_s[8 ], &d_s[9 ], &d_s[10 ], &d_s[11 ]);
load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4 ], &d_e[le='color: green'>5 ],
&d_e[6 ], &d_e[7 ], &d_e[8 ], &d_e[9 ], &d_e[10 ], &d_e[11 ]);
int16x8_t s_tr[8 ], e_tr[8 ];
transpose_elems_s16_4x8(d_s[0 ], d_s[1 ], d_s[2 ], d_s[3 ], d_s[4 ], d_s[5 ],
d_s[6 ], d_s[7 ], &s_tr[0 ], &s_tr[1 ], &s_tr[le='color: green'>2 ],
&s_tr[3 ]);
transpose_elems_s16_4x8(d_s[8 ], d_s[9 ], d_s[10 ], d_s[11 ], zeros, zeros,
zeros, zeros, &s_tr[4 ], &s_tr[5 ], &s_tr[='color: green'>6 ],
&s_tr[7 ]);
transpose_elems_s16_4x8(d_e[0 ], d_e[1 ], d_e[2 ], d_e[3 ], d_e[4 ], d_e[5 ],
d_e[6 ], d_e[7 ], &e_tr[0 ], &e_tr[1 ], &e_tr[le='color: green'>2 ],
&e_tr[3 ]);
transpose_elems_s16_4x8(d_e[8 ], d_e[9 ], d_e[10 ], d_e[11 ], zeros, zeros,
zeros, zeros, &e_tr[4 ], &e_tr[5 ], &e_tr[='color: green'>6 ],
&e_tr[7 ]);
int16x8_t start_col0[5 ], start_col1[5 ], start_col2[5 ], start_col3[5 ];
start_col0[0 ] = vandq_s16(s_tr[0 ], mask_h);
start_col0[1 ] = vandq_s16(vextq_s16(s_tr[0 ], s_tr[4 ], 1 ), mask_h);
start_col0[2 ] = vandq_s16(vextq_s16(s_tr[0 ], s_tr[4 ], 2 ), mask_h);
start_col0[3 ] = vandq_s16(vextq_s16(s_tr[0 ], s_tr[4 ], 3 ), mask_h);
start_col0[4 ] = vandq_s16(vextq_s16(s_tr[0 ], s_tr[4 ], 4 ), mask_h);
start_col1[0 ] = vandq_s16(s_tr[1 ], mask_h);
start_col1[1 ] = vandq_s16(vextq_s16(s_tr[1 ], s_tr[5 ], 1 ), mask_h);
start_col1[2 ] = vandq_s16(vextq_s16(s_tr[1 ], s_tr[5 ], 2 ), mask_h);
start_col1[3 ] = vandq_s16(vextq_s16(s_tr[1 ], s_tr[5 ], 3 ), mask_h);
start_col1[4 ] = vandq_s16(vextq_s16(s_tr[1 ], s_tr[5 ], 4 ), mask_h);
start_col2[0 ] = vandq_s16(s_tr[2 ], mask_h);
start_col2[1 ] = vandq_s16(vextq_s16(s_tr[2 ], s_tr[6 ], 1 ), mask_h);
start_col2[2 ] = vandq_s16(vextq_s16(s_tr[2 ], s_tr[6 ], 2 ), mask_h);
start_col2[3 ] = vandq_s16(vextq_s16(s_tr[2 ], s_tr[6 ], 3 ), mask_h);
start_col2[4 ] = vandq_s16(vextq_s16(s_tr[2 ], s_tr[6 ], 4 ), mask_h);
start_col3[0 ] = vandq_s16(s_tr[3 ], mask_h);
start_col3[1 ] = vandq_s16(vextq_s16(s_tr[3 ], s_tr[7 ], 1 ), mask_h);
start_col3[2 ] = vandq_s16(vextq_s16(s_tr[3 ], s_tr[7 ], 2 ), mask_h);
start_col3[3 ] = vandq_s16(vextq_s16(s_tr[3 ], s_tr[7 ], 3 ), mask_h);
start_col3[4 ] = vandq_s16(vextq_s16(s_tr[3 ], s_tr[7 ], 4 ), mask_h);
// i = 1, j = 2;
sub_deltas_step4(start_col0, start_col1, deltas[0 ]);
// i = 1, j = 3;
sub_deltas_step4(start_col0, start_col2, deltas[1 ]);
// i = 1, j = 4
sub_deltas_step4(start_col0, start_col3, deltas[2 ]);
// i = 2, j = 3
sub_deltas_step4(start_col1, start_col2, deltas[3 ]);
// i = 2, j = 4
sub_deltas_step4(start_col1, start_col3, deltas[4 ]);
// i = 3, j = 4
sub_deltas_step4(start_col2, start_col3, deltas[5 ]);
int16x8_t end_col0[5 ], end_col1[5 ], end_col2[5 ], end_col3[5 ];
end_col0[0 ] = vandq_s16(e_tr[0 ], mask_h);
end_col0[1 ] = vandq_s16(vextq_s16(e_tr[0 ], e_tr[4 ], 1 ), mask_h);
end_col0[2 ] = vandq_s16(vextq_s16(e_tr[0 ], e_tr[4 ], 2 ), mask_h);
end_col0[3 ] = vandq_s16(vextq_s16(e_tr[0 ], e_tr[4 ], 3 ), mask_h);
end_col0[4 ] = vandq_s16(vextq_s16(e_tr[0 ], e_tr[4 ], 4 ), mask_h);
end_col1[0 ] = vandq_s16(e_tr[1 ], mask_h);
end_col1[1 ] = vandq_s16(vextq_s16(e_tr[1 ], e_tr[5 ], 1 ), mask_h);
end_col1[2 ] = vandq_s16(vextq_s16(e_tr[1 ], e_tr[5 ], 2 ), mask_h);
end_col1[3 ] = vandq_s16(vextq_s16(e_tr[1 ], e_tr[5 ], 3 ), mask_h);
end_col1[4 ] = vandq_s16(vextq_s16(e_tr[1 ], e_tr[5 ], 4 ), mask_h);
end_col2[0 ] = vandq_s16(e_tr[2 ], mask_h);
end_col2[1 ] = vandq_s16(vextq_s16(e_tr[2 ], e_tr[6 ], 1 ), mask_h);
end_col2[2 ] = vandq_s16(vextq_s16(e_tr[2 ], e_tr[6 ], 2 ), mask_h);
end_col2[3 ] = vandq_s16(vextq_s16(e_tr[2 ], e_tr[6 ], 3 ), mask_h);
end_col2[4 ] = vandq_s16(vextq_s16(e_tr[2 ], e_tr[6 ], 4 ), mask_h);
end_col3[0 ] = vandq_s16(e_tr[3 ], mask_h);
end_col3[1 ] = vandq_s16(vextq_s16(e_tr[3 ], e_tr[7 ], 1 ), mask_h);
end_col3[2 ] = vandq_s16(vextq_s16(e_tr[3 ], e_tr[7 ], 2 ), mask_h);
end_col3[3 ] = vandq_s16(vextq_s16(e_tr[3 ], e_tr[7 ], 3 ), mask_h);
end_col3[4 ] = vandq_s16(vextq_s16(e_tr[3 ], e_tr[7 ], 4 ), mask_h);
// i = 1, j = 2;
add_deltas_step4(end_col0, end_col1, deltas[0 ]);
// i = 1, j = 3;
add_deltas_step4(end_col0, end_col2, deltas[1 ]);
// i = 1, j = 4
add_deltas_step4(end_col0, end_col3, deltas[2 ]);
// i = 2, j =3
add_deltas_step4(end_col1, end_col2, deltas[3 ]);
// i = 2, j = 4
add_deltas_step4(end_col1, end_col3, deltas[4 ]);
// i = 3, j = 4
add_deltas_step4(end_col2, end_col3, deltas[5 ]);
}
int32x4_t delta[6 ][2 ];
int32_t single_delta[6 ];
delta[0 ][0 ] = horizontal_add_4d_s32x4(&deltas[0 ][0 ]);
delta[1 ][0 ] = horizontal_add_4d_s32x4(&deltas[1 ][0 ]);
delta[2 ][0 ] = horizontal_add_4d_s32x4(&deltas[2 ][0 ]);
delta[3 ][0 ] = horizontal_add_4d_s32x4(&deltas[3 ][0 ]);
delta[4 ][0 ] = horizontal_add_4d_s32x4(&deltas[4 ][0 ]);
delta[5 ][0 ] = horizontal_add_4d_s32x4(&deltas[5 ][0 ]);
delta[0 ][1 ] = horizontal_add_4d_s32x4(&deltas[0 ][5 ]);
delta[1 ][1 ] = horizontal_add_4d_s32x4(&deltas[1 ][5 ]);
delta[2 ][1 ] = horizontal_add_4d_s32x4(&deltas[2 ][5 ]);
delta[3 ][1 ] = horizontal_add_4d_s32x4(&deltas[3 ][5 ]);
delta[4 ][1 ] = horizontal_add_4d_s32x4(&deltas[4 ][5 ]);
delta[5 ][1 ] = horizontal_add_4d_s32x4(&deltas[5 ][5 ]);
single_delta[0 ] = horizontal_add_s32x4(deltas[0 ][4 ]);
single_delta[1 ] = horizontal_add_s32x4(deltas[1 ][4 ]);
single_delta[2 ] = horizontal_add_s32x4(deltas[2 ][4 ]);
single_delta[3 ] = horizontal_add_s32x4(deltas[3 ][4 ]);
single_delta[4 ] = horizontal_add_s32x4(deltas[4 ][4 ]);
single_delta[5 ] = horizontal_add_s32x4(deltas[5 ][4 ]);
int idx = 0 ;
for (i = 1 ; i < wiener_win - 1 ; i++) {
for (j = i + 1 ; j < wiener_win; j++) {
update_4_stats_neon(
H + (i - 1 ) * wiener_win * wiener_win2 + (j - 1 ) * wiener_win,
delta[idx][0 ], H + i * wiener_win * wiener_win2 + j * wiener_win);
H[i * wiener_win * wiener_win2 + j * wiener_win + 4 ] =
H[(i - 1 ) * wiener_win * wiener_win2 + (j - 1 ) * wiener_win + 4 ] +
single_delta[idx];
H[(i * wiener_win + 1 ) * wiener_win2 + j * wiener_win] =
H[((i - 1 ) * wiener_win + 1 ) * wiener_win2 + (j - 1 ) * wiener_win] +
vgetq_lane_s32(delta[idx][1 ], 0 );
H[(i * wiener_win + 2 ) * wiener_win2 + j * wiener_win] =
H[((i - 1 ) * wiener_win + 2 ) * wiener_win2 + (j - 1 ) * wiener_win] +
vgetq_lane_s32(delta[idx][1 ], 1 );
H[(i * wiener_win + 3 ) * wiener_win2 + j * wiener_win] =
H[((i - 1 ) * wiener_win + 3 ) * wiener_win2 + (j - 1 ) * wiener_win] +
vgetq_lane_s32(delta[idx][1 ], 2 );
H[(i * wiener_win + 4 ) * wiener_win2 + j * wiener_win] =
H[((i - 1 ) * wiener_win + 4 ) * wiener_win2 + (j - 1 ) * wiener_win] +
vgetq_lane_s32(delta[idx][1 ], 3 );
idx++;
}
}
}
// Step 5: Derive other points of each square. No square in bottom row.
i = 0 ;
do {
const int16_t *const di = d + i;
j = i + 1 ;
do {
const int16_t *const dj = d + j;
int32x4_t deltas[WIENER_WIN_CHROMA - 1 ][WIENER_WIN_CHROMA - 1 ] = {
{ vdupq_n_s32(0 ) }, { vdupq_n_s32(0 ) }
};
int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
int16x8_t d_js[WIN_CHROMA], d_je[WIN_CHROMA];
x = 0 ;
while (x < w16) {
load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
d_js, d_je);
derive_square_win5_neon(d_is, d_ie, d_js, d_je, deltas);
x += 16 ;
}
if (w16 != width) {
load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
d_js, d_je);
d_is[0 ] = vandq_s16(d_is[0 ], mask[0 ]);
d_is[1 ] = vandq_s16(d_is[1 ], mask[1 ]);
d_is[2 ] = vandq_s16(d_is[2 ], mask[0 ]);
d_is[3 ] = vandq_s16(d_is[3 ], mask[1 ]);
d_is[4 ] = vandq_s16(d_is[4 ], mask[0 ]);
d_is[5 ] = vandq_s16(d_is[5 ], mask[1 ]);
d_is[6 ] = vandq_s16(d_is[6 ], mask[0 ]);
d_is[7 ] = vandq_s16(d_is[7 ], mask[1 ]);
d_ie[0 ] = vandq_s16(d_ie[0 ], mask[0 ]);
d_ie[1 ] = vandq_s16(d_ie[1 ], mask[1 ]);
d_ie[2 ] = vandq_s16(d_ie[2 ], mask[0 ]);
d_ie[3 ] = vandq_s16(d_ie[3 ], mask[1 ]);
d_ie[4 ] = vandq_s16(d_ie[4 ], mask[0 ]);
d_ie[5 ] = vandq_s16(d_ie[5 ], mask[1 ]);
d_ie[6 ] = vandq_s16(d_ie[6 ], mask[0 ]);
d_ie[7 ] = vandq_s16(d_ie[7 ], mask[1 ]);
derive_square_win5_neon(d_is, d_ie, d_js, d_je, deltas);
}
hadd_update_4_stats_neon(
H + (i * wiener_win + 0 ) * wiener_win2 + j * wiener_win, deltas[0 ],
H + (i * wiener_win + 1 ) * wiener_win2 + j * wiener_win + 1 );
hadd_update_4_stats_neon(
H + (i * wiener_win + 1 ) * wiener_win2 + j * wiener_win, deltas[1 ],
H + (i * wiener_win + 2 ) * wiener_win2 + j * wiener_win + 1 );
hadd_update_4_stats_neon(
H + (i * wiener_win + 2 ) * wiener_win2 + j * wiener_win, deltas[2 ],
H + (i * wiener_win + 3 ) * wiener_win2 + j * wiener_win + 1 );
hadd_update_4_stats_neon(
H + (i * wiener_win + 3 ) * wiener_win2 + j * wiener_win, deltas[3 ],
H + (i * wiener_win + 4 ) * wiener_win2 + j * wiener_win + 1 );
} while (++j < wiener_win);
} while (++i < wiener_win - 1 );
// Step 6: Derive other points of each upper triangle along the diagonal.
i = 0 ;
do {
const int16_t *const di = d + i;
int32x4_t deltas[WIENER_WIN_CHROMA * 2 + 1 ] = { vdupq_n_s32(0 ) };
int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
x = 0 ;
while (x < w16) {
load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie);
derive_triangle_win5_neon(d_is, d_ie, deltas);
x += 16 ;
}
if (w16 != width) {
load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie);
d_is[0 ] = vandq_s16(d_is[0 ], mask[0 ]);
d_is[1 ] = vandq_s16(d_is[1 ], mask[1 ]);
d_is[2 ] = vandq_s16(d_is[2 ], mask[0 ]);
d_is[3 ] = vandq_s16(d_is[3 ], mask[1 ]);
d_is[4 ] = vandq_s16(d_is[4 ], mask[0 ]);
d_is[5 ] = vandq_s16(d_is[5 ], mask[1 ]);
d_is[6 ] = vandq_s16(d_is[6 ], mask[0 ]);
d_is[7 ] = vandq_s16(d_is[7 ], mask[1 ]);
d_ie[0 ] = vandq_s16(d_ie[0 ], mask[0 ]);
d_ie[1 ] = vandq_s16(d_ie[1 ], mask[1 ]);
d_ie[2 ] = vandq_s16(d_ie[2 ], mask[0 ]);
d_ie[3 ] = vandq_s16(d_ie[3 ], mask[1 ]);
d_ie[4 ] = vandq_s16(d_ie[4 ], mask[0 ]);
d_ie[5 ] = vandq_s16(d_ie[5 ], mask[1 ]);
d_ie[6 ] = vandq_s16(d_ie[6 ], mask[0 ]);
d_ie[7 ] = vandq_s16(d_ie[7 ], mask[1 ]);
derive_triangle_win5_neon(d_is, d_ie, deltas);
}
// Row 1: 4 points
hadd_update_4_stats_neon(
H + (i * wiener_win + 0 ) * wiener_win2 + i * wiener_win, deltas,
H + (i * wiener_win + 1 ) * wiener_win2 + i * wiener_win + 1 );
// Row 2: 3 points
int64x2_t delta4_s64 = vpaddlq_s32(deltas[4 ]);
int64x2_t delta5_s64 = vpaddlq_s32(deltas[5 ]);
#if AOM_ARCH_AARCH64
int64x2_t deltas45 = vpaddq_s64(delta4_s64, delta5_s64);
int64x2_t src =
vld1q_s64(H + (i * wiener_win + 1 ) * wiener_win2 + i * wiener_win + 1 );
int64x2_t dst = vaddq_s64(src, deltas45);
vst1q_s64(H + (i * wiener_win + 2 ) * wiener_win2 + i * wiener_win + 2 , dst);
#else
H[(i * wiener_win + 2 ) * wiener_win2 + i * wiener_win + 2 + 0 ] =
H[(i * wiener_win + 1 ) * wiener_win2 + i * wiener_win + 1 + 0 ] +
horizontal_add_s64x2(delta4_s64);
H[(i * wiener_win + 2 ) * wiener_win2 + i * wiener_win + 2 + 1 ] =
H[(i * wiener_win + 1 ) * wiener_win2 + i * wiener_win + 1 + 1 ] +
horizontal_add_s64x2(delta5_s64);
#endif // AOM_ARCH_AARCH64
H[(i * wiener_win + 2 ) * wiener_win2 + i * wiener_win + 4 ] =
H[(i * wiener_win + 1 ) * wiener_win2 + i * wiener_win + 3 ] +
horizontal_long_add_s32x4(deltas[6 ]);
// Row 3: 2 points
int64x2_t delta7_s64 = vpaddlq_s32(deltas[7 ]);
int64x2_t delta8_s64 = vpaddlq_s32(deltas[8 ]);
#if AOM_ARCH_AARCH64
int64x2_t deltas78 = vpaddq_s64(delta7_s64, delta8_s64);
vst1q_s64(H + (i * wiener_win + 3 ) * wiener_win2 + i * wiener_win + 3 ,
vaddq_s64(dst, deltas78));
#else
H[(i * wiener_win + 3 ) * wiener_win2 + i * wiener_win + 3 + 0 ] =
H[(i * wiener_win + 2 ) * wiener_win2 + i * wiener_win + 2 + 0 ] +
horizontal_add_s64x2(delta7_s64);
H[(i * wiener_win + 3 ) * wiener_win2 + i * wiener_win + 3 + 1 ] =
H[(i * wiener_win + 2 ) * wiener_win2 + i * wiener_win + 2 + 1 ] +
horizontal_add_s64x2(delta8_s64);
#endif // AOM_ARCH_AARCH64
// Row 4: 1 point
H[(i * wiener_win + 4 ) * wiener_win2 + i * wiener_win + 4 ] =
H[(i * wiener_win + 3 ) * wiener_win2 + i * wiener_win + 3 ] +
horizontal_long_add_s32x4(deltas[9 ]);
} while (++i < wiener_win);
}
static inline void hadd_update_6_stats_neon(const int64_t *const src,
const int32x4_t *deltas,
int64_t *const dst) {
int64x2_t delta0_s64 = vpaddlq_s32(deltas[0 ]);
int64x2_t delta1_s64 = vpaddlq_s32(deltas[1 ]);
int64x2_t delta2_s64 = vpaddlq_s32(deltas[2 ]);
int64x2_t delta3_s64 = vpaddlq_s32(deltas[3 ]);
int64x2_t delta4_s64 = vpaddlq_s32(deltas[4 ]);
int64x2_t delta5_s64 = vpaddlq_s32(deltas[5 ]);
#if AOM_ARCH_AARCH64
int64x2_t delta01 = vpaddq_s64(delta0_s64, delta1_s64);
int64x2_t delta23 = vpaddq_s64(delta2_s64, delta3_s64);
int64x2_t delta45 = vpaddq_s64(delta4_s64, delta5_s64);
int64x2_t src0 = vld1q_s64(src);
int64x2_t src1 = vld1q_s64(src + 2 );
int64x2_t src2 = vld1q_s64(src + 4 );
vst1q_s64(dst, vaddq_s64(src0, delta01));
vst1q_s64(dst + 2 , vaddq_s64(src1, delta23));
vst1q_s64(dst + 4 , vaddq_s64(src2, delta45));
#else
dst[0 ] = src[0 ] + horizontal_add_s64x2(delta0_s64);
dst[1 ] = src[1 ] + horizontal_add_s64x2(delta1_s64);
dst[2 ] = src[2 ] + horizontal_add_s64x2(delta2_s64);
dst[3 ] = src[3 ] + horizontal_add_s64x2(delta3_s64);
dst[4 ] = src[4 ] + horizontal_add_s64x2(delta4_s64);
dst[5 ] = src[5 ] + horizontal_add_s64x2(delta5_s64);
#endif
}
static inline void compute_stats_win7_highbd_neon(
const int16_t *const d, const int32_t d_stride, const int16_t *const s,
const int32_t s_stride, const int32_t width, const int32_t height,
int64_t *const M, int64_t *const H, aom_bit_depth_t bit_depth) {
const int32_t wiener_win = WIENER_WIN;
const int32_t wiener_win2 = wiener_win * wiener_win;
const int32_t w16 = width & ~15 ;
const int32_t h8 = height & ~7 ;
int16x8_t mask[2 ];
mask[0 ] = vld1q_s16(&(mask_16bit[16 ]) - width % 16 );
mask[1 ] = vld1q_s16(&(mask_16bit[16 ]) - width % 16 + 8 );
int32_t i, j, x, y;
const int32_t num_bit_left =
32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 2 /* SIMD */;
const int32_t h_allowed =
(1 << num_bit_left) / (w16 + ((w16 != width) ? 16 : 0 ));
// Step 1: Calculate the top edge of the whole matrix, i.e., the top
// edge of each triangle and square on the top row.
j = 0 ;
do {
const int16_t *s_t = s;
const int16_t *d_t = d;
int32_t height_t = 0 ;
int64x2_t sum_m[WIENER_WIN] = { vdupq_n_s64(0 ) };
int64x2_t sum_h[WIENER_WIN] = { vdupq_n_s64(0 ) };
int16x8_t src[2 ], dgd[2 ];
do {
const int32_t h_t =
((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
int32x4_t row_m[WIENER_WIN * 2 ] = { vdupq_n_s32(0 ) };
int32x4_t row_h[WIENER_WIN * 2 ] = { vdupq_n_s32(0 ) };
y = h_t;
do {
x = 0 ;
while (x < w16) {
src[0 ] = vld1q_s16(s_t + x);
src[1 ] = vld1q_s16(s_t + x + 8 );
dgd[0 ] = vld1q_s16(d_t + x);
dgd[1 ] = vld1q_s16(d_t + x + 8 );
stats_top_win7_neon(src, dgd, d_t + j + x, d_stride, row_m, row_h);
x += 16 ;
}
if (w16 != width) {
src[0 ] = vld1q_s16(s_t + w16);
src[1 ] = vld1q_s16(s_t + w16 + 8 );
dgd[0 ] = vld1q_s16(d_t + w16);
dgd[1 ] = vld1q_s16(d_t + w16 + 8 );
src[0 ] = vandq_s16(src[0 ], mask[0 ]);
src[1 ] = vandq_s16(src[1 ], mask[1 ]);
dgd[0 ] = vandq_s16(dgd[0 ], mask[0 ]);
dgd[1 ] = vandq_s16(dgd[1 ], mask[1 ]);
stats_top_win7_neon(src, dgd, d_t + j + w16, d_stride, row_m, row_h);
}
s_t += s_stride;
d_t += d_stride;
} while (--y);
sum_m[0 ] = vpadalq_s32(sum_m[0 ], row_m[0 ]);
sum_m[1 ] = vpadalq_s32(sum_m[1 ], row_m[1 ]);
sum_m[2 ] = vpadalq_s32(sum_m[2 ], row_m[2 ]);
sum_m[3 ] = vpadalq_s32(sum_m[3 ], row_m[3 ]);
sum_m[4 ] = vpadalq_s32(sum_m[4 ], row_m[4 ]);
sum_m[5 ] = vpadalq_s32(sum_m[5 ], row_m[5 ]);
sum_m[6 ] = vpadalq_s32(sum_m[6 ], row_m[6 ]);
sum_h[0 ] = vpadalq_s32(sum_h[0 ], row_h[0 ]);
sum_h[1 ] = vpadalq_s32(sum_h[1 ], row_h[1 ]);
sum_h[2 ] = vpadalq_s32(sum_h[2 ], row_h[2 ]);
sum_h[3 ] = vpadalq_s32(sum_h[3 ], row_h[3 ]);
sum_h[4 ] = vpadalq_s32(sum_h[4 ], row_h[4 ]);
sum_h[5 ] = vpadalq_s32(sum_h[5 ], row_h[5 ]);
sum_h[6 ] = vpadalq_s32(sum_h[6 ], row_h[6 ]);
height_t += h_t;
} while (height_t < height);
#if AOM_ARCH_AARCH64
vst1q_s64(M + wiener_win * j + 0 , vpaddq_s64(sum_m[0 ], sum_m[1 ]));
vst1q_s64(M + wiener_win * j + 2 , vpaddq_s64(sum_m[2 ], sum_m[3 ]));
vst1q_s64(M + wiener_win * j + 4 , vpaddq_s64(sum_m[4 ], sum_m[5 ]));
M[wiener_win * j + 6 ] = vaddvq_s64(sum_m[6 ]);
vst1q_s64(H + wiener_win * j + 0 , vpaddq_s64(sum_h[0 ], sum_h[1 ]));
vst1q_s64(H + wiener_win * j + 2 , vpaddq_s64(sum_h[2 ], sum_h[3 ]));
vst1q_s64(H + wiener_win * j + 4 , vpaddq_s64(sum_h[4 ], sum_h[5 ]));
H[wiener_win * j + 6 ] = vaddvq_s64(sum_h[6 ]);
#else
M[wiener_win * j + 0 ] = horizontal_add_s64x2(sum_m[0 ]);
M[wiener_win * j + 1 ] = horizontal_add_s64x2(sum_m[1 ]);
M[wiener_win * j + 2 ] = horizontal_add_s64x2(sum_m[2 ]);
M[wiener_win * j + 3 ] = horizontal_add_s64x2(sum_m[3 ]);
M[wiener_win * j + 4 ] = horizontal_add_s64x2(sum_m[4 ]);
M[wiener_win * j + 5 ] = horizontal_add_s64x2(sum_m[5 ]);
M[wiener_win * j + 6 ] = horizontal_add_s64x2(sum_m[6 ]);
H[wiener_win * j + 0 ] = horizontal_add_s64x2(sum_h[0 ]);
H[wiener_win * j + 1 ] = horizontal_add_s64x2(sum_h[1 ]);
H[wiener_win * j + 2 ] = horizontal_add_s64x2(sum_h[2 ]);
H[wiener_win * j + 3 ] = horizontal_add_s64x2(sum_h[3 ]);
H[wiener_win * j + 4 ] = horizontal_add_s64x2(sum_h[4 ]);
H[wiener_win * j + 5 ] = horizontal_add_s64x2(sum_h[5 ]);
H[wiener_win * j + 6 ] = horizontal_add_s64x2(sum_h[6 ]);
#endif // AOM_ARCH_AARCH64
} while (++j < wiener_win);
// Step 2: Calculate the left edge of each square on the top row.
j = 1 ;
do {
const int16_t *d_t = d;
int32_t height_t = 0 ;
int64x2_t sum_h[WIENER_WIN - 1 ] = { vdupq_n_s64(0 ) };
int16x8_t dgd[2 ];
do {
const int32_t h_t =
((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
int32x4_t row_h[WIENER_WIN - 1 ] = { vdupq_n_s32(0 ) };
y = h_t;
do {
x = 0 ;
while (x < w16) {
dgd[0 ] = vld1q_s16(d_t + j + x + 0 );
dgd[1 ] = vld1q_s16(d_t + j + x + 8 );
stats_left_win7_neon(dgd, d_t + x, d_stride, row_h);
x += 16 ;
}
if (w16 != width) {
dgd[0 ] = vld1q_s16(d_t + j + x + 0 );
dgd[1 ] = vld1q_s16(d_t + j + x + 8 );
dgd[0 ] = vandq_s16(dgd[0 ], mask[0 ]);
dgd[1 ] = vandq_s16(dgd[1 ], mask[1 ]);
stats_left_win7_neon(dgd, d_t + x, d_stride, row_h);
}
d_t += d_stride;
} while (--y);
sum_h[0 ] = vpadalq_s32(sum_h[0 ], row_h[0 ]);
sum_h[1 ] = vpadalq_s32(sum_h[1 ], row_h[1 ]);
sum_h[2 ] = vpadalq_s32(sum_h[2 ], row_h[2 ]);
sum_h[3 ] = vpadalq_s32(sum_h[3 ], row_h[3 ]);
sum_h[4 ] = vpadalq_s32(sum_h[4 ], row_h[4 ]);
sum_h[5 ] = vpadalq_s32(sum_h[5 ], row_h[5 ]);
height_t += h_t;
} while (height_t < height);
#if AOM_ARCH_AARCH64
int64x2_t sum_h0 = vpaddq_s64(sum_h[0 ], sum_h[1 ]);
int64x2_t sum_h2 = vpaddq_s64(sum_h[2 ], sum_h[3 ]);
int64x2_t sum_h4 = vpaddq_s64(sum_h[4 ], sum_h[5 ]);
vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h0));
vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h0));
vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h2));
vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h2));
vst1_s64(&H[5 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h4));
vst1_s64(&H[6 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h4));
#else
H[1 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[0 ]);
H[2 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[1 ]);
H[3 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[2 ]);
H[4 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[3 ]);
H[5 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[4 ]);
H[6 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[5 ]);
#endif // AOM_ARCH_AARCH64
} while (++j < wiener_win);
// Step 3: Derive the top edge of each triangle along the diagonal. No
// triangle in top row.
{
const int16_t *d_t = d;
// Pad to call transpose function.
int32x4_t deltas[(WIENER_WIN + 1 ) * 2 ] = { vdupq_n_s32(0 ) };
int32x4_t deltas_tr[(WIENER_WIN + 1 ) * 2 ] = { vdupq_n_s32(0 ) };
int16x8_t ds[WIENER_WIN * 2 ];
load_s16_8x6(d_t, d_stride, &ds[0 ], &ds[>2 ], &ds[4 ], &ds[6 ], &ds[8 ],
&ds[10 ]);
load_s16_8x6(d_t + width, d_stride, &ds[1 ], &ds[3 ], &ds[5 ], &ds[7 ], &ds[9 ],
&ds[11 ]);
d_t += 6 * d_stride;
step3_win7_neon(d_t, d_stride, width, height, ds, deltas);
transpose_arrays_s32_8x8(deltas, deltas_tr);
update_8_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
deltas_tr[0 ], deltas_tr[4 ],
H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
update_8_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
deltas_tr[1 ], deltas_tr[5 ],
H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
update_8_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
deltas_tr[2 ], deltas_tr[6 ],
H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
update_8_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
deltas_tr[3 ], deltas_tr[7 ],
H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
update_8_stats_neon(H + 4 * wiener_win * wiener_win2 + 4 * wiener_win,
deltas_tr[8 ], deltas_tr[12 ],
H + 5 * wiener_win * wiener_win2 + 5 * wiener_win);
update_8_stats_neon(H + 5 * wiener_win * wiener_win2 + 5 * wiener_win,
deltas_tr[9 ], deltas_tr[13 ],
H + 6 * wiener_win * wiener_win2 + 6 * wiener_win);
}
// Step 4: Derive the top and left edge of each square. No square in top and
// bottom row.
i = 1 ;
do {
j = i + 1 ;
do {
const int16_t *di = d + i - 1 ;
const int16_t *dj = d + j - 1 ;
int32x4_t deltas[(2 * WIENER_WIN - 1 ) * 2 ] = { vdupq_n_s32(0 ) };
int16x8_t dd[WIENER_WIN * 2 ], ds[WIENER_WIN * 2 ];
dd[5 ] = vdupq_n_s16(0 ); // Initialize to avoid warning.
const int16_t dd0_values[] = { di[0 * d_stride],
di[1 * d_stride],
di[2 * d_stride],
di[3 * d_stride],
di[4 * d_stride],
di[5 * d_stride],
0 ,
0 };
dd[0 ] = vld1q_s16(dd0_values);
const int16_t dd1_values[] = { di[0 * d_stride + width],
di[1 * d_stride + width],
di[2 * d_stride + width],
di[3 * d_stride + width],
di[4 * d_stride + width],
di[5 * d_stride + width],
0 ,
0 };
dd[1 ] = vld1q_s16(dd1_values);
const int16_t ds0_values[] = { dj[0 * d_stride],
dj[1 * d_stride],
dj[2 * d_stride],
dj[3 * d_stride],
dj[4 * d_stride],
dj[5 * d_stride],
0 ,
0 };
ds[0 ] = vld1q_s16(ds0_values);
int16_t ds1_values[] = { dj[0 * d_stride + width],
dj[1 * d_stride + width],
dj[2 * d_stride + width],
dj[3 * d_stride + width],
dj[4 * d_stride + width],
dj[5 * d_stride + width],
0 ,
0 };
ds[1 ] = vld1q_s16(ds1_values);
y = 0 ;
while (y < h8) {
// 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e
dd[0 ] = vsetq_lane_s16(di[6 * d_stride], dd[0 ], 6 );
dd[0 ] = vsetq_lane_s16(di[7 * d_stride], dd[0 ], 7 );
dd[1 ] = vsetq_lane_s16(di[6 * d_stride + width], dd[1 ], 6 );
dd[1 ] = vsetq_lane_s16(di[7 * d_stride + width], dd[1 ], 7 );
// 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e
// 01s 11s 21s 31s 41s 51s 61s 71s 01e 11e 21e 31e 41e 51e 61e 71e
ds[0 ] = vsetq_lane_s16(dj[6 * d_stride], ds[0 ], 6 );
ds[0 ] = vsetq_lane_s16(dj[7 * d_stride], ds[0 ], 7 );
ds[1 ] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1 ], 6 );
ds[1 ] = vsetq_lane_s16(dj[7 * d_stride + width], ds[1 ], 7 );
load_more_16_neon(di + 8 * d_stride, width, &dd[0 ], &dd[2 ]);
load_more_16_neon(dj + 8 * d_stride, width, &ds[0 ], &ds[2 ]);
load_more_16_neon(di + 9 * d_stride, width, &dd[2 ], &dd[4 ]);
load_more_16_neon(dj + 9 * d_stride, width, &ds[2 ], &ds[4 ]);
load_more_16_neon(di + 10 * d_stride, width, &dd[4 ], &dd[6 ]);
load_more_16_neon(dj + 10 * d_stride, width, &ds[4 ], &ds[6 ]);
load_more_16_neon(di + 11 * d_stride, width, &dd[6 ], &dd[8 ]);
load_more_16_neon(dj + 11 * d_stride, width, &ds[6 ], &ds[8 ]);
load_more_16_neon(di + 12 * d_stride, width, &dd[8 ], &dd[10 ]);
load_more_16_neon(dj + 12 * d_stride, width, &ds[8 ], &ds[10 ]);
load_more_16_neon(di + 13 * d_stride, width, &dd[10 ], &dd['color: green'>12 ]);
load_more_16_neon(dj + 13 * d_stride, width, &ds[10 ], &ds['color: green'>12 ]);
madd_neon(&deltas[0 ], dd[0 ], ds[0 ]);
madd_neon(&deltas[1 ], dd[1 ], ds[1 ]);
madd_neon(&deltas[2 ], dd[0 ], ds[2 ]);
madd_neon(&deltas[3 ], dd[1 ], ds[3 ]);
madd_neon(&deltas[4 ], dd[0 ], ds[4 ]);
madd_neon(&deltas[5 ], dd[1 ], ds[5 ]);
madd_neon(&deltas[6 ], dd[0 ], ds[6 ]);
madd_neon(&deltas[7 ], dd[1 ], ds[7 ]);
madd_neon(&deltas[8 ], dd[0 ], ds[8 ]);
madd_neon(&deltas[9 ], dd[1 ], ds[9 ]);
madd_neon(&deltas[10 ], dd[0 ], ds[10 ]);
madd_neon(&deltas[11 ], dd[1 ], ds[11 ]);
madd_neon(&deltas[12 ], dd[0 ], ds[12 ]);
madd_neon(&deltas[13 ], dd[1 ], ds[13 ]);
madd_neon(&deltas[14 ], dd[2 ], ds[0 ]);
madd_neon(&deltas[15 ], dd[3 ], ds[1 ]);
madd_neon(&deltas[16 ], dd[4 ], ds[0 ]);
madd_neon(&deltas[17 ], dd[5 ], ds[1 ]);
madd_neon(&deltas[18 ], dd[6 ], ds[0 ]);
madd_neon(&deltas[19 ], dd[7 ], ds[1 ]);
madd_neon(&deltas[20 ], dd[8 ], ds[0 ]);
madd_neon(&deltas[21 ], dd[9 ], ds[1 ]);
madd_neon(&deltas[22 ], dd[10 ], ds[0 ]);
madd_neon(&deltas[23 ], dd[11 ], ds[1 ]);
madd_neon(&deltas[24 ], dd[12 ], ds[0 ]);
madd_neon(&deltas[25 ], dd[13 ], ds[1 ]);
dd[0 ] = vextq_s16(dd[12 ], vdupq_n_s16(0 ), 2 );
dd[1 ] = vextq_s16(dd[13 ], vdupq_n_s16(0 ), 2 );
ds[0 ] = vextq_s16(ds[12 ], vdupq_n_s16(0 ), 2 );
ds[1 ] = vextq_s16(ds[13 ], vdupq_n_s16(0 ), 2 );
di += 8 * d_stride;
dj += 8 * d_stride;
y += 8 ;
}
deltas[0 ] = hadd_four_32_neon(deltas[0 ], deltas[2 ], deltas[4 ], deltas[6 ]);
deltas[1 ] = hadd_four_32_neon(deltas[1 ], deltas[3 ], deltas[5 ], deltas[7 ]);
deltas[2 ] =
hadd_four_32_neon(deltas[8 ], deltas[10 ], deltas[12 ], deltas[12 ]);
deltas[3 ] =
hadd_four_32_neon(deltas[9 ], deltas[11 ], deltas[13 ], deltas[13 ]);
deltas[4 ] =
hadd_four_32_neon(deltas[14 ], deltas[16 ], deltas[18 ], deltas[20 ]);
deltas[5 ] =
hadd_four_32_neon(deltas[15 ], deltas[17 ], deltas[19 ], deltas[21 ]);
deltas[6 ] =
hadd_four_32_neon(deltas[22 ], deltas[24 ], deltas[22 ], deltas[24 ]);
deltas[7 ] =
hadd_four_32_neon(deltas[23 ], deltas[25 ], deltas[23 ], deltas[25 ]);
deltas[0 ] = vsubq_s32(deltas[1 ], deltas[0 ]);
deltas[1 ] = vsubq_s32(deltas[3 ], deltas[2 ]);
deltas[2 ] = vsubq_s32(deltas[5 ], deltas[4 ]);
deltas[3 ] = vsubq_s32(deltas[7 ], deltas[6 ]);
if (h8 != height) {
const int16_t ds0_vals[] = {
dj[0 * d_stride], dj[0 * d_stride + width],
dj[1 * d_stride], dj[1 * d_stride + width],
dj[2 * d_stride], dj[2 * d_stride + width],
dj[3 * d_stride], dj[3 * d_stride + width]
};
ds[0 ] = vld1q_s16(ds0_vals);
ds[1 ] = vsetq_lane_s16(dj[4 * d_stride], ds[1 ], 0 );
ds[1 ] = vsetq_lane_s16(dj[4 * d_stride + width], ds[1 ], 1 );
ds[1 ] = vsetq_lane_s16(dj[5 * d_stride], ds[1 ], 2 );
ds[1 ] = vsetq_lane_s16(dj[5 * d_stride + width], ds[1 ], 3 );
const int16_t dd4_vals[] = {
-di[1 * d_stride], di[1 * d_stride + width],
-di[2 * d_stride], di[2 * d_stride + width],
-di[3 * d_stride], di[3 * d_stride + width],
-di[4 * d_stride], di[4 * d_stride + width]
};
dd[4 ] = vld1q_s16(dd4_vals);
dd[5 ] = vsetq_lane_s16(-di[5 * d_stride], dd[5 ], 0 );
dd[5 ] = vsetq_lane_s16(di[5 * d_stride + width], dd[5 ], 1 );
do {
dd[0 ] = vdupq_n_s16(-di[0 * d_stride]);
dd[2 ] = dd[3 ] = vdupq_n_s16(di[0 * d_stride + width]);
dd[0 ] = dd[1 ] = vzipq_s16(dd[0 ], dd[2 ]).val[0 ];
ds[4 ] = vdupq_n_s16(dj[0 * d_stride]);
ds[6 ] = ds[7 ] = vdupq_n_s16(dj[0 * d_stride + width]);
ds[4 ] = ds[5 ] = vzipq_s16(ds[4 ], ds[6 ]).val[0 ];
dd[5 ] = vsetq_lane_s16(-di[6 * d_stride], dd[5 ], 2 );
dd[5 ] = vsetq_lane_s16(di[6 * d_stride + width], dd[5 ], 3 );
ds[1 ] = vsetq_lane_s16(dj[6 * d_stride], ds[1 ], 4 );
ds[1 ] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1 ], 5 );
madd_neon_pairwise(&deltas[0 ], dd[0 ], ds[0 ]);
madd_neon_pairwise(&deltas[1 ], dd[1 ], ds[1 ]);
madd_neon_pairwise(&deltas[2 ], dd[4 ], ds[4 ]);
madd_neon_pairwise(&deltas[3 ], dd[5 ], ds[5 ]);
int32_t tmp0 = vgetq_lane_s32(vreinterpretq_s32_s16(ds[0 ]), 0 );
ds[0 ] = vextq_s16(ds[0 ], ds[1 ], 2 );
ds[1 ] = vextq_s16(ds[1 ], ds[0 ], 2 );
ds[1 ] = vreinterpretq_s16_s32(
vsetq_lane_s32(tmp0, vreinterpretq_s32_s16(ds[1 ]), 3 ));
int32_t tmp1 = vgetq_lane_s32(vreinterpretq_s32_s16(dd[4 ]), 0 );
dd[4 ] = vextq_s16(dd[4 ], dd[5 ], 2 );
dd[5 ] = vextq_s16(dd[5 ], dd[4 ], 2 );
dd[5 ] = vreinterpretq_s16_s32(
vsetq_lane_s32(tmp1, vreinterpretq_s32_s16(dd[5 ]), 3 ));
di += d_stride;
dj += d_stride;
} while (++y < height);
}
// Writing one more element on the top edge of a square falls to
// the next square in the same row or the first element in the next
// row, which will just be overwritten later.
update_8_stats_neon(
H + (i - 1 ) * wiener_win * wiener_win2 + (j - 1 ) * wiener_win,
deltas[0 ], deltas[1 ],
H + i * wiener_win * wiener_win2 + j * wiener_win);
H[(i * wiener_win + 1 ) * wiener_win2 + j * wiener_win] =
H[((i - 1 ) * wiener_win + 1 ) * wiener_win2 + (j - 1 ) * wiener_win] +
vgetq_lane_s32(deltas[2 ], 0 );
H[(i * wiener_win + 2 ) * wiener_win2 + j * wiener_win] =
H[((i - 1 ) * wiener_win + 2 ) * wiener_win2 + (j - 1 ) * wiener_win] +
vgetq_lane_s32(deltas[2 ], 1 );
H[(i * wiener_win + 3 ) * wiener_win2 + j * wiener_win] =
H[((i - 1 ) * wiener_win + 3 ) * wiener_win2 + (j - 1 ) * wiener_win] +
vgetq_lane_s32(deltas[2 ], 2 );
H[(i * wiener_win + 4 ) * wiener_win2 + j * wiener_win] =
H[((i - 1 ) * wiener_win + 4 ) * wiener_win2 + (j - 1 ) * wiener_win] +
vgetq_lane_s32(deltas[2 ], 3 );
H[(i * wiener_win + 5 ) * wiener_win2 + j * wiener_win] =
H[((i - 1 ) * wiener_win + 5 ) * wiener_win2 + (j - 1 ) * wiener_win] +
vgetq_lane_s32(deltas[3 ], 0 );
H[(i * wiener_win + 6 ) * wiener_win2 + j * wiener_win] =
H[((i - 1 ) * wiener_win + 6 ) * wiener_win2 + (j - 1 ) * wiener_win] +
vgetq_lane_s32(deltas[3 ], 1 );
} while (++j < wiener_win);
} while (++i < wiener_win - 1 );
// Step 5: Derive other points of each square. No square in bottom row.
i = 0 ;
do {
const int16_t *const di = d + i;
j = i + 1 ;
do {
const int16_t *const dj = d + j;
int32x4_t deltas[WIENER_WIN - 1 ][WIN_7] = { { vdupq_n_s32(0 ) },
{ vdupq_n_s32(0 ) } };
int16x8_t d_is[WIN_7];
int16x8_t d_ie[WIN_7];
int16x8_t d_js[WIN_7];
int16x8_t d_je[WIN_7];
x = 0 ;
while (x < w16) {
load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
d_js, d_je);
derive_square_win7_neon(d_is, d_ie, d_js, d_je, deltas);
x += 16 ;
}
if (w16 != width) {
load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
d_js, d_je);
d_is[0 ] = vandq_s16(d_is[0 ], mask[0 ]);
d_is[1 ] = vandq_s16(d_is[1 ], mask[1 ]);
d_is[2 ] = vandq_s16(d_is[2 ], mask[0 ]);
d_is[3 ] = vandq_s16(d_is[3 ], mask[1 ]);
d_is[4 ] = vandq_s16(d_is[4 ], mask[0 ]);
d_is[5 ] = vandq_s16(d_is[5 ], mask[1 ]);
d_is[6 ] = vandq_s16(d_is[6 ], mask[0 ]);
d_is[7 ] = vandq_s16(d_is[7 ], mask[1 ]);
d_is[8 ] = vandq_s16(d_is[8 ], mask[0 ]);
d_is[9 ] = vandq_s16(d_is[9 ], mask[1 ]);
d_is[10 ] = vandq_s16(d_is[10 ], mask[0 ]);
d_is[11 ] = vandq_s16(d_is[11 ], mask[1 ]);
d_ie[0 ] = vandq_s16(d_ie[0 ], mask[0 ]);
d_ie[1 ] = vandq_s16(d_ie[1 ], mask[1 ]);
d_ie[2 ] = vandq_s16(d_ie[2 ], mask[0 ]);
d_ie[3 ] = vandq_s16(d_ie[3 ], mask[1 ]);
d_ie[4 ] = vandq_s16(d_ie[4 ], mask[0 ]);
d_ie[5 ] = vandq_s16(d_ie[5 ], mask[1 ]);
d_ie[6 ] = vandq_s16(d_ie[6 ], mask[0 ]);
d_ie[7 ] = vandq_s16(d_ie[7 ], mask[1 ]);
d_ie[8 ] = vandq_s16(d_ie[8 ], mask[0 ]);
d_ie[9 ] = vandq_s16(d_ie[9 ], mask[1 ]);
d_ie[10 ] = vandq_s16(d_ie[10 ], mask[0 ]);
d_ie[11 ] = vandq_s16(d_ie[11 ], mask[1 ]);
derive_square_win7_neon(d_is, d_ie, d_js, d_je, deltas);
}
hadd_update_6_stats_neon(
H + (i * wiener_win + 0 ) * wiener_win2 + j * wiener_win, deltas[0 ],
H + (i * wiener_win + 1 ) * wiener_win2 + j * wiener_win + 1 );
hadd_update_6_stats_neon(
H + (i * wiener_win + 1 ) * wiener_win2 + j * wiener_win, deltas[1 ],
H + (i * wiener_win + 2 ) * wiener_win2 + j * wiener_win + 1 );
hadd_update_6_stats_neon(
H + (i * wiener_win + 2 ) * wiener_win2 + j * wiener_win, deltas[2 ],
H + (i * wiener_win + 3 ) * wiener_win2 + j * wiener_win + 1 );
hadd_update_6_stats_neon(
H + (i * wiener_win + 3 ) * wiener_win2 + j * wiener_win, deltas[3 ],
H + (i * wiener_win + 4 ) * wiener_win2 + j * wiener_win + 1 );
hadd_update_6_stats_neon(
H + (i * wiener_win + 4 ) * wiener_win2 + j * wiener_win, deltas[4 ],
H + (i * wiener_win + 5 ) * wiener_win2 + j * wiener_win + 1 );
hadd_update_6_stats_neon(
H + (i * wiener_win + 5 ) * wiener_win2 + j * wiener_win, deltas[5 ],
H + (i * wiener_win + 6 ) * wiener_win2 + j * wiener_win + 1 );
} while (++j < wiener_win);
} while (++i < wiener_win - 1 );
// Step 6: Derive other points of each upper triangle along the diagonal.
i = 0 ;
do {
const int16_t *const di = d + i;
int32x4_t deltas[WIENER_WIN * (WIENER_WIN - 1 )] = { vdupq_n_s32(0 ) };
int16x8_t d_is[WIN_7], d_ie[WIN_7];
x = 0 ;
while (x < w16) {
load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie);
derive_triangle_win7_neon(d_is, d_ie, deltas);
x += 16 ;
}
if (w16 != width) {
load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie);
d_is[0 ] = vandq_s16(d_is[0 ], mask[0 ]);
d_is[1 ] = vandq_s16(d_is[1 ], mask[1 ]);
d_is[2 ] = vandq_s16(d_is[2 ], mask[0 ]);
d_is[3 ] = vandq_s16(d_is[3 ], mask[1 ]);
d_is[4 ] = vandq_s16(d_is[4 ], mask[0 ]);
d_is[5 ] = vandq_s16(d_is[5 ], mask[1 ]);
d_is[6 ] = vandq_s16(d_is[6 ], mask[0 ]);
d_is[7 ] = vandq_s16(d_is[7 ], mask[1 ]);
d_is[8 ] = vandq_s16(d_is[8 ], mask[0 ]);
d_is[9 ] = vandq_s16(d_is[9 ], mask[1 ]);
d_is[10 ] = vandq_s16(d_is[10 ], mask[0 ]);
d_is[11 ] = vandq_s16(d_is[11 ], mask[1 ]);
d_ie[0 ] = vandq_s16(d_ie[0 ], mask[0 ]);
d_ie[1 ] = vandq_s16(d_ie[1 ], mask[1 ]);
d_ie[2 ] = vandq_s16(d_ie[2 ], mask[0 ]);
d_ie[3 ] = vandq_s16(d_ie[3 ], mask[1 ]);
d_ie[4 ] = vandq_s16(d_ie[4 ], mask[0 ]);
d_ie[5 ] = vandq_s16(d_ie[5 ], mask[1 ]);
d_ie[6 ] = vandq_s16(d_ie[6 ], mask[0 ]);
d_ie[7 ] = vandq_s16(d_ie[7 ], mask[1 ]);
d_ie[8 ] = vandq_s16(d_ie[8 ], mask[0 ]);
d_ie[9 ] = vandq_s16(d_ie[9 ], mask[1 ]);
d_ie[10 ] = vandq_s16(d_ie[10 ], mask[0 ]);
d_ie[11 ] = vandq_s16(d_ie[11 ], mask[1 ]);
derive_triangle_win7_neon(d_is, d_ie, deltas);
}
// Row 1: 6 points
hadd_update_6_stats_neon(
H + (i * wiener_win + 0 ) * wiener_win2 + i * wiener_win, deltas,
H + (i * wiener_win + 1 ) * wiener_win2 + i * wiener_win + 1 );
// Row 2: 5 points
hadd_update_4_stats_neon(
H + (i * wiener_win + 1 ) * wiener_win2 + i * wiener_win + 1 , deltas + 6 ,
H + (i * wiener_win + 2 ) * wiener_win2 + i * wiener_win + 2 );
H[(i * wiener_win + 2 ) * wiener_win2 + i * wiener_win + 6 ] =
H[(i * wiener_win + 1 ) * wiener_win2 + i * wiener_win + 5 ] +
horizontal_long_add_s32x4(deltas[10 ]);
// Row 3: 4 points
hadd_update_4_stats_neon(
H + (i * wiener_win + 2 ) * wiener_win2 + i * wiener_win + 2 ,
deltas + 11 ,
H + (i * wiener_win + 3 ) * wiener_win2 + i * wiener_win + 3 );
// Row 4: 3 points
#if AOM_ARCH_AARCH64
int64x2_t delta15_s64 = vpaddlq_s32(deltas[15 ]);
int64x2_t delta16_s64 = vpaddlq_s32(deltas[16 ]);
int64x2_t delta1516 = vpaddq_s64(delta15_s64, delta16_s64);
int64x2_t h0 =
vld1q_s64(H + (i * wiener_win + 3 ) * wiener_win2 + i * wiener_win + 3 );
vst1q_s64(H + (i * wiener_win + 4 ) * wiener_win2 + i * wiener_win + 4 ,
vaddq_s64(h0, delta1516));
#else
H[(i * wiener_win + 4 ) * wiener_win2 + i * wiener_win + 4 + 0 ] =
H[(i * wiener_win + 3 ) * wiener_win2 + i * wiener_win + 3 + 0 ] +
horizontal_long_add_s32x4(deltas[15 ]);
H[(i * wiener_win + 4 ) * wiener_win2 + i * wiener_win + 4 + 1 ] =
H[(i * wiener_win + 3 ) * wiener_win2 + i * wiener_win + 3 + 1 ] +
horizontal_long_add_s32x4(deltas[16 ]);
#endif // AOM_ARCH_AARCH64
H[(i * wiener_win + 4 ) * wiener_win2 + i * wiener_win + 6 ] =
H[(i * wiener_win + 3 ) * wiener_win2 + i * wiener_win + 5 ] +
horizontal_long_add_s32x4(deltas[17 ]);
// Row 5: 2 points
int64x2_t delta18_s64 = vpaddlq_s32(deltas[18 ]);
int64x2_t delta19_s64 = vpaddlq_s32(deltas[19 ]);
#if AOM_ARCH_AARCH64
int64x2_t delta1819 = vpaddq_s64(delta18_s64, delta19_s64);
int64x2_t h1 =
vld1q_s64(H + (i * wiener_win + 4 ) * wiener_win2 + i * wiener_win + 4 );
vst1q_s64(H + (i * wiener_win + 5 ) * wiener_win2 + i * wiener_win + 5 ,
vaddq_s64(h1, delta1819));
#else
H[(i * wiener_win + 5 ) * wiener_win2 + i * wiener_win + 5 ] =
H[(i * wiener_win + 4 ) * wiener_win2 + i * wiener_win + 4 ] +
horizontal_add_s64x2(delta18_s64);
H[(i * wiener_win + 5 ) * wiener_win2 + i * wiener_win + 5 + 1 ] =
H[(i * wiener_win + 4 ) * wiener_win2 + i * wiener_win + 4 + 1 ] +
horizontal_add_s64x2(delta19_s64);
#endif // AOM_ARCH_AARCH64
// Row 6: 1 points
H[(i * wiener_win + 6 ) * wiener_win2 + i * wiener_win + 6 ] =
H[(i * wiener_win + 5 ) * wiener_win2 + i * wiener_win + 5 ] +
horizontal_long_add_s32x4(deltas[20 ]);
} while (++i < wiener_win);
}
static inline void sub_avg_block_highbd_neon(const uint16_t *src,
const int32_t src_stride,
const uint16_t avg,
const int32_t width,
const int32_t height, int16_t *dst,
const int32_t dst_stride) {
const uint16x8_t a = vdupq_n_u16(avg);
int32_t i = height + 1 ;
do {
int32_t j = 0 ;
while (j < width) {
const uint16x8_t s = vld1q_u16(src + j);
const uint16x8_t d = vsubq_u16(s, a);
vst1q_s16(dst + j, vreinterpretq_s16_u16(d));
j += 8 ;
}
src += src_stride;
dst += dst_stride;
} while (--i);
}
static inline uint16_t highbd_find_average_neon(const uint16_t *src,
int src_stride, int width,
int height) {
assert(width > 0 );
assert(height > 0 );
uint64x2_t sum_u64 = vdupq_n_u64(0 );
uint64_t sum = 0 ;
const uint16x8_t mask =
vreinterpretq_u16_s16(vld1q_s16(&mask_16bit[16 ] - (width % 8 )));
int h = height;
do {
uint32x4_t sum_u32[2 ] = { vdupq_n_u32(0 ), vdupq_n_u32(0 ) };
int w = width;
const uint16_t *row = src;
while (w >= 32 ) {
uint16x8_t s0 = vld1q_u16(row + 0 );
uint16x8_t s1 = vld1q_u16(row + 8 );
uint16x8_t s2 = vld1q_u16(row + 16 );
uint16x8_t s3 = vld1q_u16(row + 24 );
s0 = vaddq_u16(s0, s1);
s2 = vaddq_u16(s2, s3);
sum_u32[0 ] = vpadalq_u16(sum_u32[0 ], s0);
sum_u32[1 ] = vpadalq_u16(sum_u32[1 ], s2);
row += 32 ;
w -= 32 ;
}
if (w >= 16 ) {
uint16x8_t s0 = vld1q_u16(row + 0 );
uint16x8_t s1 = vld1q_u16(row + 8 );
s0 = vaddq_u16(s0, s1);
sum_u32[0 ] = vpadalq_u16(sum_u32[0 ], s0);
row += 16 ;
w -= 16 ;
}
if (w >= 8 ) {
uint16x8_t s0 = vld1q_u16(row);
sum_u32[1 ] = vpadalq_u16(sum_u32[1 ], s0);
row += 8 ;
w -= 8 ;
}
if (w) {
uint16x8_t s0 = vandq_u16(vld1q_u16(row), mask);
sum_u32[1 ] = vpadalq_u16(sum_u32[1 ], s0);
row += 8 ;
w -= 8 ;
}
sum_u64 = vpadalq_u32(sum_u64, vaddq_u32(sum_u32[0 ], sum_u32[1 ]));
src += src_stride;
} while (--h != 0 );
return (uint16_t)((horizontal_add_u64x2(sum_u64) + sum) / (height * width));
}
void av1_compute_stats_highbd_neon(int32_t wiener_win, const uint8_t *dgd8,
const uint8_t *src8, int16_t *dgd_avg,
int16_t *src_avg, int32_t h_start,
int32_t h_end, int32_t v_start,
int32_t v_end, int32_t dgd_stride,
int32_t src_stride, int64_t *M, int64_t *H,
aom_bit_depth_t bit_depth) {
const int32_t wiener_win2 = wiener_win * wiener_win;
const int32_t wiener_halfwin = (wiener_win >> 1 );
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
const int32_t width = h_end - h_start;
const int32_t height = v_end - v_start;
const uint16_t *dgd_start = dgd + h_start + v_start * dgd_stride;
const uint16_t avg =
highbd_find_average_neon(dgd_start, dgd_stride, width, height);
const int32_t d_stride = (width + 2 * wiener_halfwin + 15 ) & ~15 ;
const int32_t s_stride = (width + 15 ) & ~15 ;
sub_avg_block_highbd_neon(src + v_start * src_stride + h_start, src_stride,
avg, width, height, src_avg, s_stride);
sub_avg_block_highbd_neon(
dgd + (v_start - wiener_halfwin) * dgd_stride + h_start - wiener_halfwin,
dgd_stride, avg, width + 2 * wiener_halfwin, height + 2 * wiener_halfwin,
dgd_avg, d_stride);
if (wiener_win == WIENER_WIN) {
compute_stats_win7_highbd_neon(dgd_avg, d_stride, src_avg, s_stride, width,
height, M, H, bit_depth);
} else if (wiener_win == WIENER_WIN_CHROMA) {
compute_stats_win5_highbd_neon(dgd_avg, d_stride, src_avg, s_stride, width,
height, M, H, bit_depth);
}
// H is a symmetric matrix, so we only need to fill out the upper triangle.
// We can copy it down to the lower triangle outside the (i, j) loops.
if (bit_depth == AOM_BITS_8) {
diagonal_copy_stats_neon(wiener_win2, H);
} else if (bit_depth == AOM_BITS_10) { // bit_depth == AOM_BITS_10
const int32_t k4 = wiener_win2 & ~3 ;
int32_t k = 0 ;
do {
int64x2_t dst = div4_neon(vld1q_s64(M + k));
vst1q_s64(M + k, dst);
dst = div4_neon(vld1q_s64(M + k + 2 ));
vst1q_s64(M + k + 2 , dst);
H[k * wiener_win2 + k] /= 4 ;
k += 4 ;
} while (k < k4);
H[k * wiener_win2 + k] /= 4 ;
for (; k < wiener_win2; ++k) {
M[k] /= 4 ;
}
div4_diagonal_copy_stats_neon(wiener_win2, H);
} else { // bit_depth == AOM_BITS_12
const int32_t k4 = wiener_win2 & ~3 ;
int32_t k = 0 ;
do {
int64x2_t dst = div16_neon(vld1q_s64(M + k));
vst1q_s64(M + k, dst);
dst = div16_neon(vld1q_s64(M + k + 2 ));
vst1q_s64(M + k + 2 , dst);
H[k * wiener_win2 + k] /= 16 ;
k += 4 ;
} while (k < k4);
H[k * wiener_win2 + k] /= 16 ;
for (; k < wiener_win2; ++k) {
M[k] /= 16 ;
}
div16_diagonal_copy_stats_neon(wiener_win2, H);
}
}
int64_t av1_highbd_pixel_proj_error_neon(
const uint8_t *src8, int width, int height, int src_stride,
const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
int32_t *flt1, int flt1_stride, int xq[2 ], const sgr_params_type *params) {
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
int64_t sse = 0 ;
int64x2_t sse_s64 = vdupq_n_s64(0 );
if (params->r[0 ] > 0 && params->r[1 ] > 0 ) {
int32x2_t xq_v = vld1_s32(xq);
int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), 4 );
do {
int j = 0 ;
int32x4_t sse_s32 = vdupq_n_s32(0 );
do {
const uint16x8_t d = vld1q_u16(&dat[j]);
const uint16x8_t s = vld1q_u16(&src[j]);
int32x4_t flt0_0 = vld1q_s32(&flt0[j]);
int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4 ]);
int32x4_t flt1_0 = vld1q_s32(&flt1[j]);
int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4 ]);
int32x4_t d_s32_lo = vreinterpretq_s32_u32(
vmull_lane_u16(vget_low_u16(d), vreinterpret_u16_s32(xq_sum_v), 0 ));
int32x4_t d_s32_hi = vreinterpretq_s32_u32(vmull_lane_u16(
vget_high_u16(d), vreinterpret_u16_s32(xq_sum_v), 0 ));
int32x4_t v0 = vsubq_s32(
vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1 )),
d_s32_lo);
int32x4_t v1 = vsubq_s32(
vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1 )),
d_s32_hi);
v0 = vmlaq_lane_s32(v0, flt0_0, xq_v, 0 );
v1 = vmlaq_lane_s32(v1, flt0_1, xq_v, 0 );
v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1 );
v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1 );
int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1),
vreinterpretq_s16_u16(vsubq_u16(d, s)));
int16x4_t e_lo = vget_low_s16(e);
int16x4_t e_hi = vget_high_s16(e);
sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
j += 8 ;
} while (j <= width - 8 );
for (int k = j; k < width; ++k) {
int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1 );
v += xq[0 ] * (flt0[k]) + xq[1 ] * (flt1[k]);
v -= (xq[1 ] + xq[0 ]) * (int32_t)(dat[k] << 4 );
int32_t e =
(v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
sse += ((int64_t)e * e);
}
sse_s64 = vpadalq_s32(sse_s64, sse_s32);
dat += dat_stride;
src += src_stride;
flt0 += flt0_stride;
flt1 += flt1_stride;
} while (--height != 0 );
} else if (params->r[0 ] > 0 || params->r[1 ] > 0 ) {
int xq_active = (params->r[0 ] > 0 ) ? xq[0 ] : xq[1 ];
int32_t *flt = (params->r[0 ] > 0 ) ? flt0 : flt1;
int flt_stride = (params->r[0 ] > 0 ) ? flt0_stride : flt1_stride;
int32x4_t xq_v = vdupq_n_s32(xq_active);
do {
int j = 0 ;
int32x4_t sse_s32 = vdupq_n_s32(0 );
do {
const uint16x8_t d0 = vld1q_u16(&dat[j]);
const uint16x8_t s0 = vld1q_u16(&src[j]);
int32x4_t flt0_0 = vld1q_s32(&flt[j]);
int32x4_t flt0_1 = vld1q_s32(&flt[j + 4 ]);
uint16x8_t d_u16 = vshlq_n_u16(d0, 4 );
int32x4_t sub0 = vreinterpretq_s32_u32(
vsubw_u16(vreinterpretq_u32_s32(flt0_0), vget_low_u16(d_u16)));
int32x4_t sub1 = vreinterpretq_s32_u32(
vsubw_u16(vreinterpretq_u32_s32(flt0_1), vget_high_u16(d_u16)));
int32x4_t v0 = vmlaq_s32(
vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1 )), sub0,
xq_v);
int32x4_t v1 = vmlaq_s32(
vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1 )), sub1,
xq_v);
int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1),
vreinterpretq_s16_u16(vsubq_u16(d0, s0)));
int16x4_t e_lo = vget_low_s16(e);
int16x4_t e_hi = vget_high_s16(e);
sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
j += 8 ;
} while (j <= width - 8 );
for (int k = j; k < width; ++k) {
int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1 );
v += xq_active * (int32_t)((uint32_t)flt[j] - (uint16_t)(dat[k] << 4 ));
const int32_t e =
(v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
sse += ((int64_t)e * e);
}
sse_s64 = vpadalq_s32(sse_s64, sse_s32);
dat += dat_stride;
flt += flt_stride;
src += src_stride;
} while (--height != 0 );
} else {
do {
int j = 0 ;
do {
const uint16x8_t d = vld1q_u16(&dat[j]);
const uint16x8_t s = vld1q_u16(&src[j]);
uint16x8_t diff = vabdq_u16(d, s);
uint16x4_t diff_lo = vget_low_u16(diff);
uint16x4_t diff_hi = vget_high_u16(diff);
uint32x4_t sqr_lo = vmull_u16(diff_lo, diff_lo);
uint32x4_t sqr_hi = vmull_u16(diff_hi, diff_hi);
sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_lo));
sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_hi));
j += 8 ;
} while (j <= width - 8 );
for (int k = j; k < width; ++k) {
int32_t e = dat[k] - src[k];
sse += e * e;
}
dat += dat_stride;
src += src_stride;
} while (--height != 0 );
}
sse += horizontal_add_s64x2(sse_s64);
return sse;
}
Messung V0.5 in Prozent C=92 H=84 G=87
¤ Dauer der Verarbeitung: 0.32 Sekunden
(vorverarbeitet am 2026-06-06)
¤
*© Formatika GbR, Deutschland