/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <stdint.h>
#include <stdlib.h>
#include "common/attributes.h"
#include "common/bitdepth.h"
#include "common/intops.h"
#include "src/looprestoration.h"
#include "src/tables.h"
// 256 * 1.5 + 3 + 3 = 390
#define REST_UNIT_STRIDE (390 )
static void wiener_filter_h(uint16_t *dst, const pixel (*left)[4 ],
const pixel *src, const int16_t fh[8 ],
const int w, const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX)
{
const int bitdepth = bitdepth_from_max(bitdepth_max);
const int round_bits_h = 3 + (bitdepth == 12 ) * 2 ;
const int rounding_off_h = 1 << (round_bits_h - 1 );
const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
if (w < 6 ) {
// For small widths, do the fully conditional loop with
// conditions on each access.
for (int x = 0 ; x < w; x++) {
int sum = (1 << (bitdepth + 6 ));
#if BITDEPTH == 8
sum += src[x] * 128 ;
#endif
for (int i = 0 ; i < 7 ; i++) {
int idx = x + i - 3 ;
if (idx < 0 ) {
if (!(edges & LR_HAVE_LEFT))
sum += src[0 ] * fh[i];
else if (left)
sum += left[0 ][4 + idx] * fh[i];
else
sum += src[idx] * fh[i];
} else if (idx >= w && !(edges & LR_HAVE_RIGHT)) {
sum += src[w - 1 ] * fh[i];
} else
sum += src[idx] * fh[i];
}
sum = iclip((sum + rounding_off_h) >> round_bits_h, 0 , clip_limit - 1 );
dst[x] = sum;
}
return ;
}
// For larger widths, do separate loops with less conditions; first
// handle the start of the row.
int start = 3 ;
if (!(edges & LR_HAVE_LEFT)) {
// If there's no left edge, pad using the leftmost pixel.
for (int x = 0 ; x < 3 ; x++) {
int sum = (1 << (bitdepth + 6 ));
#if BITDEPTH == 8
sum += src[x] * 128 ;
#endif
for (int i = 0 ; i < 7 ; i++) {
int idx = x + i - 3 ;
if (idx < 0 )
sum += src[0 ] * fh[i];
else
sum += src[idx] * fh[i];
}
sum = iclip((sum + rounding_off_h) >> round_bits_h, 0 , clip_limit - 1 );
dst[x] = sum;
}
} else if (left) {
// If we have the left edge and a separate left buffer, pad using that.
for (int x = 0 ; x < 3 ; x++) {
int sum = (1 << (bitdepth + 6 ));
#if BITDEPTH == 8
sum += src[x] * 128 ;
#endif
for (int i = 0 ; i < 7 ; i++) {
int idx = x + i - 3 ;
if (idx < 0 )
sum += left[0 ][4 + idx] * fh[i];
else
sum += src[idx] * fh[i];
}
sum = iclip((sum + rounding_off_h) >> round_bits_h, 0 , clip_limit - 1 );
dst[x] = sum;
}
} else {
// If we have the left edge, but no separate left buffer, we're in the
// top/bottom area (lpf) with the left edge existing in the same
// buffer; just do the regular loop from the start.
start = 0 ;
}
int end = w - 3 ;
if (edges & LR_HAVE_RIGHT)
end = w;
// Do a condititon free loop for the bulk of the row.
for (int x = start; x < end; x++) {
int sum = (1 << (bitdepth + 6 ));
#if BITDEPTH == 8
sum += src[x] * 128 ;
#endif
for (int i = 0 ; i < 7 ; i++) {
int idx = x + i - 3 ;
sum += src[idx] * fh[i];
}
sum = iclip((sum + rounding_off_h) >> round_bits_h, 0 , clip_limit - 1 );
dst[x] = sum;
}
// If we need to, calculate the end of the row with a condition for
// right edge padding.
for (int x = end; x < w; x++) {
int sum = (1 << (bitdepth + 6 ));
#if BITDEPTH == 8
sum += src[x] * 128 ;
#endif
for (int i = 0 ; i < 7 ; i++) {
int idx = x + i - 3 ;
if (idx >= w)
sum += src[w - 1 ] * fh[i];
else
sum += src[idx] * fh[i];
}
sum = iclip((sum + rounding_off_h) >> round_bits_h, 0 , clip_limit - 1 );
dst[x] = sum;
}
}
static void wiener_filter_v(pixel *p, uint16_t **ptrs, const int16_t fv[8 ],
const int w HIGHBD_DECL_SUFFIX)
{
const int bitdepth = bitdepth_from_max(bitdepth_max);
const int round_bits_v = 11 - (bitdepth == 12 ) * 2 ;
const int rounding_off_v = 1 << (round_bits_v - 1 );
const int round_offset = 1 << (bitdepth + (round_bits_v - 1 ));
for (int i = 0 ; i < w; i++) {
int sum = -round_offset;
// Only filter using 6 input rows. The 7th row is assumed to be
// identical to the last one.
//
// This function is assumed to only be called at the end, when doing
// padding at the bottom.
for (int k = 0 ; k < 6 ; k++)
sum += ptrs[k][i] * fv[k];
sum += ptrs[5 ][i] * fv[6 ];
p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
}
// Shift the pointers, but only update the first 5; the 6th pointer is kept
// as it was before (and the 7th is implicitly identical to the 6th).
for (int i = 0 ; i < 5 ; i++)
ptrs[i] = ptrs[i + 1 ];
}
static void wiener_filter_hv(pixel *p, uint16_t **ptrs, const pixel (*left)[4 ],
const pixel *src, const int16_t filter[2 ][8 ],
const int w, const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX)
{
const int bitdepth = bitdepth_from_max(bitdepth_max);
const int round_bits_v = 11 - (bitdepth == 12 ) * 2 ;
const int rounding_off_v = 1 << (round_bits_v - 1 );
const int round_offset = 1 << (bitdepth + (round_bits_v - 1 ));
const int16_t *fh = filter[0 ];
const int16_t *fv = filter[1 ];
// Do combined horziontal and vertical filtering; doing horizontal
// filtering of one row, combined with vertical filtering of 6
// preexisting rows and the newly filtered row.
// For simplicity in the C implementation, just do a separate call
// of the horizontal filter, into a temporary buffer.
uint16_t tmp[REST_UNIT_STRIDE];
wiener_filter_h(tmp, left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
for (int i = 0 ; i < w; i++) {
int sum = -round_offset;
// Filter using the 6 stored preexisting rows, and the newly
// filtered one in tmp[].
for (int k = 0 ; k < 6 ; k++)
sum += ptrs[k][i] * fv[k];
sum += tmp[i] * fv[6 ];
// At this point, after having read all inputs at point [i], we
// could overwrite [i] with the newly filtered data.
p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
}
// For simplicity in the C implementation, just memcpy the newly
// filtered row into ptrs[6]. Normally, in steady state filtering,
// this output row, ptrs[6], is equal to ptrs[0]. However at startup,
// at the top of the filtered area, we may have ptrs[0] equal to ptrs[1],
// so we can't assume we can write into ptrs[0] but we need to keep
// a separate pointer for the next row to write into.
memcpy(ptrs[6 ], tmp, sizeof (uint16_t) * REST_UNIT_STRIDE);
// Rotate the window of pointers. Shift the 6 pointers downwards one step.
for (int i = 0 ; i < 6 ; i++)
ptrs[i] = ptrs[i + 1 ];
// The topmost pointer, ptrs[6], which isn't used as input, is set to
// ptrs[0], which will be used as output for the next _hv call.
// At the start of the filtering, the caller may set ptrs[6] to the
// right next buffer to fill in, instead.
ptrs[6 ] = ptrs[0 ];
}
// FIXME Could split into luma and chroma specific functions,
// (since first and last tops are always 0 for chroma)
static void wiener_c(pixel *p, const ptrdiff_t stride,
const pixel (*left)[4 ],
const pixel *lpf, const int w, int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
// Values stored between horizontal and vertical filtering don't
// fit in a uint8_t.
uint16_t hor[6 * REST_UNIT_STRIDE];
uint16_t *ptrs[7 ], *rows[6 ];
for (int i = 0 ; i < 6 ; i++)
rows[i] = &hor[i * REST_UNIT_STRIDE];
const int16_t (*const filter)[8 ] = params->filter;
const int16_t *fh = params->filter[0 ];
const int16_t *fv = params->filter[1 ];
const pixel *lpf_bottom = lpf + 6 *PXSTRIDE(stride);
const pixel *src = p;
if (edges & LR_HAVE_TOP) {
ptrs[0 ] = rows[0 ];
ptrs[1 ] = rows[0 ];
ptrs[2 ] = rows[1 ];
ptrs[3 ] = rows[2 ];
ptrs[4 ] = rows[2 ];
ptrs[5 ] = rows[2 ];
wiener_filter_h(rows[0 ], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);
lpf += PXSTRIDE(stride);
wiener_filter_h(rows[1 ], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);
wiener_filter_h(rows[2 ], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
left++;
src += PXSTRIDE(stride);
if (--h <= 0 )
goto v1;
ptrs[4 ] = ptrs[5 ] = rows[3 ];
wiener_filter_h(rows[3 ], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
left++;
src += PXSTRIDE(stride);
if (--h <= 0 )
goto v2;
ptrs[5 ] = rows[4 ];
wiener_filter_h(rows[4 ], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
left++;
src += PXSTRIDE(stride);
if (--h <= 0 )
goto v3;
} else {
ptrs[0 ] = rows[0 ];
ptrs[1 ] = rows[0 ];
ptrs[2 ] = rows[0 ];
ptrs[3 ] = rows[0 ];
ptrs[4 ] = rows[0 ];
ptrs[5 ] = rows[0 ];
wiener_filter_h(rows[0 ], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
left++;
src += PXSTRIDE(stride);
if (--h <= 0 )
goto v1;
ptrs[4 ] = ptrs[5 ] = rows[1 ];
wiener_filter_h(rows[1 ], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
left++;
src += PXSTRIDE(stride);
if (--h <= 0 )
goto v2;
ptrs[5 ] = rows[2 ];
wiener_filter_h(rows[2 ], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
left++;
src += PXSTRIDE(stride);
if (--h <= 0 )
goto v3;
ptrs[6 ] = rows[3 ];
wiener_filter_hv(p, ptrs, left, src, filter, w, edges
HIGHBD_TAIL_SUFFIX);
left++;
src += PXSTRIDE(stride);
p += PXSTRIDE(stride);
if (--h <= 0 )
goto v3;
ptrs[6 ] = rows[4 ];
wiener_filter_hv(p, ptrs, left, src, filter, w, edges
HIGHBD_TAIL_SUFFIX);
left++;
src += PXSTRIDE(stride);
p += PXSTRIDE(stride);
if (--h <= 0 )
goto v3;
}
ptrs[6 ] = ptrs[5 ] + REST_UNIT_STRIDE;
do {
wiener_filter_hv(p, ptrs, left, src, filter, w, edges
HIGHBD_TAIL_SUFFIX);
left++;
src += PXSTRIDE(stride);
p += PXSTRIDE(stride);
} while (--h > 0 );
if (!(edges & LR_HAVE_BOTTOM))
goto v3;
wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
HIGHBD_TAIL_SUFFIX);
lpf_bottom += PXSTRIDE(stride);
p += PXSTRIDE(stride);
wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
HIGHBD_TAIL_SUFFIX);
p += PXSTRIDE(stride);
v1:
wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
return ;
v3:
wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
p += PXSTRIDE(stride);
v2:
wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
p += PXSTRIDE(stride);
goto v1;
}
// SGR
static NOINLINE void rotate(int32_t **sumsq_ptrs, coef **sum_ptrs, int n)
{
int32_t *tmp32 = sumsq_ptrs[0 ];
coef *tmpc = sum_ptrs[0 ];
for (int i = 0 ; i < n - 1 ; i++) {
sumsq_ptrs[i] = sumsq_ptrs[i + 1 ];
sum_ptrs[i] = sum_ptrs[i + 1 ];
}
sumsq_ptrs[n - 1 ] = tmp32;
sum_ptrs[n - 1 ] = tmpc;
}
static NOINLINE void rotate5_x2(int32_t **sumsq_ptrs, coef **sum_ptrs)
{
int32_t *tmp32[2 ];
coef *tmpc[2 ];
for (int i = 0 ; i < 2 ; i++) {
tmp32[i] = sumsq_ptrs[i];
tmpc[i] = sum_ptrs[i];
}
for (int i = 0 ; i < 3 ; i++) {
sumsq_ptrs[i] = sumsq_ptrs[i + 2 ];
sum_ptrs[i] = sum_ptrs[i + 2 ];
}
for (int i = 0 ; i < 2 ; i++) {
sumsq_ptrs[3 + i] = tmp32[i];
sum_ptrs[3 + i] = tmpc[i];
}
}
static NOINLINE void sgr_box3_row_h(int32_t *sumsq, coef *sum,
const pixel (*left)[4 ],
const pixel *src, const int w,
const enum LrEdgeFlags edges)
{
sumsq++;
sum++;
int a = edges & LR_HAVE_LEFT ? (left ? left[0 ][2 ] : src[-2 ]) : src[0 ];
int b = edges & LR_HAVE_LEFT ? (left ? left[0 ][3 ] : src[-1 ]) : src[0 ];
for (int x = -1 ; x < w + 1 ; x++) {
int c = (x + 1 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 1 ] : src[w - 1 ];
sum[x] = a + b + c;
sumsq[x] = a * a + b * b + c * c;
a = b;
b = c;
}
}
static NOINLINE void sgr_box5_row_h(int32_t *sumsq, coef *sum,
const pixel (*left)[4 ],
const pixel *src, const int w,
const enum LrEdgeFlags edges)
{
sumsq++;
sum++;
int a = edges & LR_HAVE_LEFT ? (left ? left[0 ][1 ] : src[-3 ]) : src[0 ];
int b = edges & LR_HAVE_LEFT ? (left ? left[0 ][2 ] : src[-2 ]) : src[0 ];
int c = edges & LR_HAVE_LEFT ? (left ? left[0 ][3 ] : src[-1 ]) : src[0 ];
int d = src[0 ];
for (int x = -1 ; x < w + 1 ; x++) {
int e = (x + 2 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 2 ] : src[w - 1 ];
sum[x] = a + b + c + d + e;
sumsq[x] = a * a + b * b + c * c + d * d + e * e;
a = b;
b = c;
c = d;
d = e;
}
}
static void sgr_box35_row_h(int32_t *sumsq3, coef *sum3,
int32_t *sumsq5, coef *sum5,
const pixel (*left)[4 ],
const pixel *src, const int w,
const enum LrEdgeFlags edges)
{
sgr_box3_row_h(sumsq3, sum3, left, src, w, edges);
sgr_box5_row_h(sumsq5, sum5, left, src, w, edges);
}
static NOINLINE void sgr_box3_row_v(int32_t **sumsq, coef **sum,
int32_t *sumsq_out, coef *sum_out,
const int w)
{
for (int x = 0 ; x < w + 2 ; x++) {
int sq_a = sumsq[0 ][x];
int sq_b = sumsq[1 ][x];
int sq_c = sumsq[2 ][x];
int s_a = sum[0 ][x];
int s_b = sum[1 ][x];
int s_c = sum[2 ][x];
sumsq_out[x] = sq_a + sq_b + sq_c;
sum_out[x] = s_a + s_b + s_c;
}
}
static NOINLINE void sgr_box5_row_v(int32_t **sumsq, coef **sum,
int32_t *sumsq_out, coef *sum_out,
const int w)
{
for (int x = 0 ; x < w + 2 ; x++) {
int sq_a = sumsq[0 ][x];
int sq_b = sumsq[1 ][x];
int sq_c = sumsq[2 ][x];
int sq_d = sumsq[3 ][x];
int sq_e = sumsq[4 ][x];
int s_a = sum[0 ][x];
int s_b = sum[1 ][x];
int s_c = sum[2 ][x];
int s_d = sum[3 ][x];
int s_e = sum[4 ][x];
sumsq_out[x] = sq_a + sq_b + sq_c + sq_d + sq_e;
sum_out[x] = s_a + s_b + s_c + s_d + s_e;
}
}
static NOINLINE void sgr_calc_row_ab(int32_t *AA, coef *BB, int w, int s,
int bitdepth_max, int n, int sgr_one_by_x)
{
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8 ;
for (int i = 0 ; i < w + 2 ; i++) {
const int a =
(AA[i] + ((1 << (2 * bitdepth_min_8)) >> 1 )) >> (2 * bitdepth_min_8);
const int b =
(BB[i] + ((1 << bitdepth_min_8) >> 1 )) >> bitdepth_min_8;
const unsigned p = imax(a * n - b * b, 0 );
const unsigned z = (p * s + (1 << 19 )) >> 20 ;
const unsigned x = dav1d_sgr_x_by_x[umin(z, 255 )];
// This is where we invert A and B, so that B is of size coef.
AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11 )) >> 12 ;
BB[i] = x;
}
}
static void sgr_box3_vert(int32_t **sumsq, coef **sum,
int32_t *sumsq_out, coef *sum_out,
const int w, const int s, const int bitdepth_max)
{
sgr_box3_row_v(sumsq, sum, sumsq_out, sum_out, w);
sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 9 , 455 );
rotate(sumsq, sum, 3 );
}
static void sgr_box5_vert(int32_t **sumsq, coef **sum,
int32_t *sumsq_out, coef *sum_out,
const int w, const int s, const int bitdepth_max)
{
sgr_box5_row_v(sumsq, sum, sumsq_out, sum_out, w);
sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 25 , 164 );
rotate5_x2(sumsq, sum);
}
static void sgr_box3_hv(int32_t **sumsq, coef **sum,
int32_t *AA, coef *BB,
const pixel (*left)[4 ],
const pixel *src, const int w,
const int s,
const enum LrEdgeFlags edges,
const int bitdepth_max)
{
sgr_box3_row_h(sumsq[2 ], sum[2 ], left, src, w, edges);
sgr_box3_vert(sumsq, sum, AA, BB, w, s, bitdepth_max);
}
static NOINLINE void sgr_finish_filter_row1(coef *tmp,
const pixel *src,
int32_t **A_ptrs, coef **B_ptrs,
const int w)
{
#define EIGHT_NEIGHBORS(P, i)\
((P[1 ][i] + P[1 ][i - 1 ] + P[1 ][i + 1 ] + P[0 ][i] + P[2 ][i]) * 4 + \
(P[0 ][i - 1 ] + P[2 ][i - 1 ] + \
P[0 ][i + 1 ] + P[2 ][i + 1 ]) * 3 )
for (int i = 0 ; i < w; i++) {
const int a = EIGHT_NEIGHBORS(B_ptrs, i + 1 );
const int b = EIGHT_NEIGHBORS(A_ptrs, i + 1 );
tmp[i] = (b - a * src[i] + (1 << 8 )) >> 9 ;
}
#undef EIGHT_NEIGHBORS
}
#define FILTER_OUT_STRIDE (384 )
static NOINLINE void sgr_finish_filter2(coef *tmp,
const pixel *src,
const ptrdiff_t src_stride,
int32_t **A_ptrs, coef **B_ptrs,
const int w, const int h)
{
#define SIX_NEIGHBORS(P, i)\
((P[0 ][i] + P[1 ][i]) * 6 + \
(P[0 ][i - 1 ] + P[1 ][i - 1 ] + \
P[0 ][i + 1 ] + P[1 ][i + 1 ]) * 5 )
for (int i = 0 ; i < w; i++) {
const int a = SIX_NEIGHBORS(B_ptrs, i + 1 );
const int b = SIX_NEIGHBORS(A_ptrs, i + 1 );
tmp[i] = (b - a * src[i] + (1 << 8 )) >> 9 ;
}
if (h <= 1 )
return ;
tmp += FILTER_OUT_STRIDE;
src += PXSTRIDE(src_stride);
const int32_t *A = &A_ptrs[1 ][1 ];
const coef *B = &B_ptrs[1 ][1 ];
for (int i = 0 ; i < w; i++) {
const int a = B[i] * 6 + (B[i - 1 ] + B[i + 1 ]) * 5 ;
const int b = A[i] * 6 + (A[i - 1 ] + A[i + 1 ]) * 5 ;
tmp[i] = (b - a * src[i] + (1 << 7 )) >> 8 ;
}
#undef SIX_NEIGHBORS
}
static NOINLINE void sgr_weighted_row1(pixel *dst, const coef *t1,
const int w, const int w1 HIGHBD_DECL_SUFFIX)
{
for (int i = 0 ; i < w; i++) {
const int v = w1 * t1[i];
dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10 )) >> 11 ));
}
}
static NOINLINE void sgr_weighted2(pixel *dst, const ptrdiff_t dst_stride,
const coef *t1, const coef *t2,
const int w, const int h,
const int w0, const int w1 HIGHBD_DECL_SUFFIX)
{
for (int j = 0 ; j < h; j++) {
for (int i = 0 ; i < w; i++) {
const int v = w0 * t1[i] + w1 * t2[i];
dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10 )) >> 11 ));
}
dst += PXSTRIDE(dst_stride);
t1 += FILTER_OUT_STRIDE;
t2 += FILTER_OUT_STRIDE;
}
}
static NOINLINE void sgr_finish1(pixel **dst, const ptrdiff_t stride,
int32_t **A_ptrs, coef **B_ptrs, const int w,
const int w1 HIGHBD_DECL_SUFFIX)
{
// Only one single row, no stride needed
ALIGN_STK_16(coef, tmp, 384 ,);
sgr_finish_filter_row1(tmp, *dst, A_ptrs, B_ptrs, w);
sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
*dst += PXSTRIDE(stride);
rotate(A_ptrs, B_ptrs, 3 );
}
static NOINLINE void sgr_finish2(pixel **dst, const ptrdiff_t stride,
int32_t **A_ptrs, coef **B_ptrs,
const int w, const int h, const int w1
HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(coef, tmp, 2 *FILTER_OUT_STRIDE,);
sgr_finish_filter2(tmp, *dst, stride, A_ptrs, B_ptrs, w, h);
sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
*dst += PXSTRIDE(stride);
if (h > 1 ) {
sgr_weighted_row1(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX);
*dst += PXSTRIDE(stride);
}
rotate(A_ptrs, B_ptrs, 2 );
}
static NOINLINE void sgr_finish_mix(pixel **dst, const ptrdiff_t stride,
int32_t **A5_ptrs, coef **B5_ptrs,
int32_t **A3_ptrs, coef **B3_ptrs,
const int w, const int h,
const int w0, const int w1 HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(coef, tmp5, 2 *FILTER_OUT_STRIDE,);
ALIGN_STK_16(coef, tmp3, 2 *FILTER_OUT_STRIDE,);
sgr_finish_filter2(tmp5, *dst, stride, A5_ptrs, B5_ptrs, w, h);
sgr_finish_filter_row1(tmp3, *dst, A3_ptrs, B3_ptrs, w);
if (h > 1 )
sgr_finish_filter_row1(tmp3 + FILTER_OUT_STRIDE, *dst + PXSTRIDE(stride),
&A3_ptrs[1 ], &B3_ptrs[1 ], w);
sgr_weighted2(*dst, stride, tmp5, tmp3, w, h, w0, w1 HIGHBD_TAIL_SUFFIX);
*dst += h*PXSTRIDE(stride);
rotate(A5_ptrs, B5_ptrs, 2 );
rotate(A3_ptrs, B3_ptrs, 4 );
}
static void sgr_3x3_c(pixel *dst, const ptrdiff_t stride,
const pixel (*left)[4 ], const pixel *lpf,
const int w, int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
#define BUF_STRIDE (384 + 16 )
ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16 ,);
ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 3 + 16 ,);
int32_t *sumsq_ptrs[3 ], *sumsq_rows[3 ];
coef *sum_ptrs[3 ], *sum_rows[3 ];
for (int i = 0 ; i < 3 ; i++) {
sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
sum_rows[i] = &sum_buf[i * BUF_STRIDE];
}
ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16 ,);
ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 3 + 16 ,);
int32_t *A_ptrs[3 ];
coef *B_ptrs[3 ];
for (int i = 0 ; i < 3 ; i++) {
A_ptrs[i] = &A_buf[i * BUF_STRIDE];
B_ptrs[i] = &B_buf[i * BUF_STRIDE];
}
const pixel *src = dst;
const pixel *lpf_bottom = lpf + 6 *PXSTRIDE(stride);
if (edges & LR_HAVE_TOP) {
sumsq_ptrs[0 ] = sumsq_rows[0 ];
sumsq_ptrs[1 ] = sumsq_rows[1 ];
sumsq_ptrs[2 ] = sumsq_rows[2 ];
sum_ptrs[0 ] = sum_rows[0 ];
sum_ptrs[1 ] = sum_rows[1 ];
sum_ptrs[2 ] = sum_rows[2 ];
sgr_box3_row_h(sumsq_rows[0 ], sum_rows[0 ], NULL, lpf, w, edges);
lpf += PXSTRIDE(stride);
sgr_box3_row_h(sumsq_rows[1 ], sum_rows[1 ], NULL, lpf, w, edges);
sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2 ], B_ptrs[2 ],
left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
left++;
src += PXSTRIDE(stride);
rotate(A_ptrs, B_ptrs, 3 );
if (--h <= 0 )
goto vert_1;
sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2 ], B_ptrs[2 ],
left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
left++;
src += PXSTRIDE(stride);
rotate(A_ptrs, B_ptrs, 3 );
if (--h <= 0 )
goto vert_2;
} else {
sumsq_ptrs[0 ] = sumsq_rows[0 ];
sumsq_ptrs[1 ] = sumsq_rows[0 ];
sumsq_ptrs[2 ] = sumsq_rows[0 ];
sum_ptrs[0 ] = sum_rows[0 ];
sum_ptrs[1 ] = sum_rows[0 ];
sum_ptrs[2 ] = sum_rows[0 ];
sgr_box3_row_h(sumsq_rows[0 ], sum_rows[0 ], left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2 ], B_ptrs[2 ],
w, params->sgr.s1, BITDEPTH_MAX);
rotate(A_ptrs, B_ptrs, 3 );
if (--h <= 0 )
goto vert_1;
sumsq_ptrs[2 ] = sumsq_rows[1 ];
sum_ptrs[2 ] = sum_rows[1 ];
sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2 ], B_ptrs[2 ],
left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
left++;
src += PXSTRIDE(stride);
rotate(A_ptrs, B_ptrs, 3 );
if (--h <= 0 )
goto vert_2;
sumsq_ptrs[2 ] = sumsq_rows[2 ];
sum_ptrs[2 ] = sum_rows[2 ];
}
do {
sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2 ], B_ptrs[2 ],
left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
left++;
src += PXSTRIDE(stride);
sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
} while (--h > 0 );
if (!(edges & LR_HAVE_BOTTOM))
goto vert_2;
sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2 ], B_ptrs[2 ],
NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
lpf_bottom += PXSTRIDE(stride);
sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2 ], B_ptrs[2 ],
NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
return ;
vert_2:
sumsq_ptrs[2 ] = sumsq_ptrs[1 ];
sum_ptrs[2 ] = sum_ptrs[1 ];
sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2 ], B_ptrs[2 ],
w, params->sgr.s1, BITDEPTH_MAX);
sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
output_1:
sumsq_ptrs[2 ] = sumsq_ptrs[1 ];
sum_ptrs[2 ] = sum_ptrs[1 ];
sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2 ], B_ptrs[2 ],
w, params->sgr.s1, BITDEPTH_MAX);
sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
return ;
vert_1:
sumsq_ptrs[2 ] = sumsq_ptrs[1 ];
sum_ptrs[2 ] = sum_ptrs[1 ];
sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2 ], B_ptrs[2 ],
w, params->sgr.s1, BITDEPTH_MAX);
rotate(A_ptrs, B_ptrs, 3 );
goto output_1;
}
static void sgr_5x5_c(pixel *dst, const ptrdiff_t stride,
const pixel (*left)[4 ], const pixel *lpf,
const int w, int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16 ,);
ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 5 + 16 ,);
int32_t *sumsq_ptrs[5 ], *sumsq_rows[5 ];
coef *sum_ptrs[5 ], *sum_rows[5 ];
for (int i = 0 ; i < 5 ; i++) {
sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
sum_rows[i] = &sum_buf[i * BUF_STRIDE];
}
ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16 ,);
ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 2 + 16 ,);
int32_t *A_ptrs[2 ];
coef *B_ptrs[2 ];
for (int i = 0 ; i < 2 ; i++) {
A_ptrs[i] = &A_buf[i * BUF_STRIDE];
B_ptrs[i] = &B_buf[i * BUF_STRIDE];
}
const pixel *src = dst;
const pixel *lpf_bottom = lpf + 6 *PXSTRIDE(stride);
if (edges & LR_HAVE_TOP) {
sumsq_ptrs[0 ] = sumsq_rows[0 ];
sumsq_ptrs[1 ] = sumsq_rows[0 ];
sumsq_ptrs[2 ] = sumsq_rows[1 ];
sumsq_ptrs[3 ] = sumsq_rows[2 ];
sumsq_ptrs[4 ] = sumsq_rows[3 ];
sum_ptrs[0 ] = sum_rows[0 ];
sum_ptrs[1 ] = sum_rows[0 ];
sum_ptrs[2 ] = sum_rows[1 ];
sum_ptrs[3 ] = sum_rows[2 ];
sum_ptrs[4 ] = sum_rows[3 ];
sgr_box5_row_h(sumsq_rows[0 ], sum_rows[0 ], NULL, lpf, w, edges);
lpf += PXSTRIDE(stride);
sgr_box5_row_h(sumsq_rows[1 ], sum_rows[1 ], NULL, lpf, w, edges);
sgr_box5_row_h(sumsq_rows[2 ], sum_rows[2 ], left, src, w, edges);
left++;
src += PXSTRIDE(stride);
if (--h <= 0 )
goto vert_1;
sgr_box5_row_h(sumsq_rows[3 ], sum_rows[3 ], left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1 ], B_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
rotate(A_ptrs, B_ptrs, 2 );
if (--h <= 0 )
goto vert_2;
// ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
// one of them to point at the previously unused rows[4].
sumsq_ptrs[3 ] = sumsq_rows[4 ];
sum_ptrs[3 ] = sum_rows[4 ];
} else {
sumsq_ptrs[0 ] = sumsq_rows[0 ];
sumsq_ptrs[1 ] = sumsq_rows[0 ];
sumsq_ptrs[2 ] = sumsq_rows[0 ];
sumsq_ptrs[3 ] = sumsq_rows[0 ];
sumsq_ptrs[4 ] = sumsq_rows[0 ];
sum_ptrs[0 ] = sum_rows[0 ];
sum_ptrs[1 ] = sum_rows[0 ];
sum_ptrs[2 ] = sum_rows[0 ];
sum_ptrs[3 ] = sum_rows[0 ];
sum_ptrs[4 ] = sum_rows[0 ];
sgr_box5_row_h(sumsq_rows[0 ], sum_rows[0 ], left, src, w, edges);
left++;
src += PXSTRIDE(stride);
if (--h <= 0 )
goto vert_1;
sumsq_ptrs[4 ] = sumsq_rows[1 ];
sum_ptrs[4 ] = sum_rows[1 ];
sgr_box5_row_h(sumsq_rows[1 ], sum_rows[1 ], left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1 ], B_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
rotate(A_ptrs, B_ptrs, 2 );
if (--h <= 0 )
goto vert_2;
sumsq_ptrs[3 ] = sumsq_rows[2 ];
sumsq_ptrs[4 ] = sumsq_rows[3 ];
sum_ptrs[3 ] = sum_rows[2 ];
sum_ptrs[4 ] = sum_rows[3 ];
sgr_box5_row_h(sumsq_rows[2 ], sum_rows[2 ], left, src, w, edges);
left++;
src += PXSTRIDE(stride);
if (--h <= 0 )
goto odd;
sgr_box5_row_h(sumsq_rows[3 ], sum_rows[3 ], left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1 ], B_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
w, 2 , params->sgr.w0 HIGHBD_TAIL_SUFFIX);
if (--h <= 0 )
goto vert_2;
// ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
// one of them to point at the previously unused rows[4].
sumsq_ptrs[3 ] = sumsq_rows[4 ];
sum_ptrs[3 ] = sum_rows[4 ];
}
do {
sgr_box5_row_h(sumsq_ptrs[3 ], sum_ptrs[3 ], left, src, w, edges);
left++;
src += PXSTRIDE(stride);
if (--h <= 0 )
goto odd;
sgr_box5_row_h(sumsq_ptrs[4 ], sum_ptrs[4 ], left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1 ], B_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
w, 2 , params->sgr.w0 HIGHBD_TAIL_SUFFIX);
} while (--h > 0 );
if (!(edges & LR_HAVE_BOTTOM))
goto vert_2;
sgr_box5_row_h(sumsq_ptrs[3 ], sum_ptrs[3 ], NULL, lpf_bottom, w, edges);
lpf_bottom += PXSTRIDE(stride);
sgr_box5_row_h(sumsq_ptrs[4 ], sum_ptrs[4 ], NULL, lpf_bottom, w, edges);
output_2:
sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1 ], B_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
w, 2 , params->sgr.w0 HIGHBD_TAIL_SUFFIX);
return ;
vert_2:
// Duplicate the last row twice more
sumsq_ptrs[3 ] = sumsq_ptrs[2 ];
sumsq_ptrs[4 ] = sumsq_ptrs[2 ];
sum_ptrs[3 ] = sum_ptrs[2 ];
sum_ptrs[4 ] = sum_ptrs[2 ];
goto output_2;
odd:
// Copy the last row as padding once
sumsq_ptrs[4 ] = sumsq_ptrs[3 ];
sum_ptrs[4 ] = sum_ptrs[3 ];
sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1 ], B_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
w, 2 , params->sgr.w0 HIGHBD_TAIL_SUFFIX);
output_1:
// Duplicate the last row twice more
sumsq_ptrs[3 ] = sumsq_ptrs[2 ];
sumsq_ptrs[4 ] = sumsq_ptrs[2 ];
sum_ptrs[3 ] = sum_ptrs[2 ];
sum_ptrs[4 ] = sum_ptrs[2 ];
sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1 ], B_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
// Output only one row
sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
w, 1 , params->sgr.w0 HIGHBD_TAIL_SUFFIX);
return ;
vert_1:
// Copy the last row as padding once
sumsq_ptrs[4 ] = sumsq_ptrs[3 ];
sum_ptrs[4 ] = sum_ptrs[3 ];
sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1 ], B_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
rotate(A_ptrs, B_ptrs, 2 );
goto output_1;
}
static void sgr_mix_c(pixel *dst, const ptrdiff_t stride,
const pixel (*left)[4 ], const pixel *lpf,
const int w, int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16 ,);
ALIGN_STK_16(coef, sum5_buf, BUF_STRIDE * 5 + 16 ,);
int32_t *sumsq5_ptrs[5 ], *sumsq5_rows[5 ];
coef *sum5_ptrs[5 ], *sum5_rows[5 ];
for (int i = 0 ; i < 5 ; i++) {
sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
}
ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16 ,);
ALIGN_STK_16(coef, sum3_buf, BUF_STRIDE * 3 + 16 ,);
int32_t *sumsq3_ptrs[3 ], *sumsq3_rows[3 ];
coef *sum3_ptrs[3 ], *sum3_rows[3 ];
for (int i = 0 ; i < 3 ; i++) {
sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
}
ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16 ,);
ALIGN_STK_16(coef, B5_buf, BUF_STRIDE * 2 + 16 ,);
int32_t *A5_ptrs[2 ];
coef *B5_ptrs[2 ];
for (int i = 0 ; i < 2 ; i++) {
A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
}
ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16 ,);
ALIGN_STK_16(coef, B3_buf, BUF_STRIDE * 4 + 16 ,);
int32_t *A3_ptrs[4 ];
coef *B3_ptrs[4 ];
for (int i = 0 ; i < 4 ; i++) {
A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
}
const pixel *src = dst;
const pixel *lpf_bottom = lpf + 6 *PXSTRIDE(stride);
if (edges & LR_HAVE_TOP) {
sumsq5_ptrs[0 ] = sumsq5_rows[0 ];
sumsq5_ptrs[1 ] = sumsq5_rows[0 ];
sumsq5_ptrs[2 ] = sumsq5_rows[1 ];
sumsq5_ptrs[3 ] = sumsq5_rows[2 ];
sumsq5_ptrs[4 ] = sumsq5_rows[3 ];
sum5_ptrs[0 ] = sum5_rows[0 ];
sum5_ptrs[1 ] = sum5_rows[0 ];
sum5_ptrs[2 ] = sum5_rows[1 ];
sum5_ptrs[3 ] = sum5_rows[2 ];
sum5_ptrs[4 ] = sum5_rows[3 ];
sumsq3_ptrs[0 ] = sumsq3_rows[0 ];
sumsq3_ptrs[1 ] = sumsq3_rows[1 ];
sumsq3_ptrs[2 ] = sumsq3_rows[2 ];
sum3_ptrs[0 ] = sum3_rows[0 ];
sum3_ptrs[1 ] = sum3_rows[1 ];
sum3_ptrs[2 ] = sum3_rows[2 ];
sgr_box35_row_h(sumsq3_rows[0 ], sum3_rows[0 ],
sumsq5_rows[0 ], sum5_rows[0 ],
NULL, lpf, w, edges);
lpf += PXSTRIDE(stride);
sgr_box35_row_h(sumsq3_rows[1 ], sum3_rows[1 ],
sumsq5_rows[1 ], sum5_rows[1 ],
NULL, lpf, w, edges);
sgr_box35_row_h(sumsq3_rows[2 ], sum3_rows[2 ],
sumsq5_rows[2 ], sum5_rows[2 ],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3 ], B3_ptrs[3 ],
w, params->sgr.s1, BITDEPTH_MAX);
rotate(A3_ptrs, B3_ptrs, 4 );
if (--h <= 0 )
goto vert_1;
sgr_box35_row_h(sumsq3_ptrs[2 ], sum3_ptrs[2 ],
sumsq5_rows[3 ], sum5_rows[3 ],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1 ], B5_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
rotate(A5_ptrs, B5_ptrs, 2 );
sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3 ], B3_ptrs[3 ],
w, params->sgr.s1, BITDEPTH_MAX);
rotate(A3_ptrs, B3_ptrs, 4 );
if (--h <= 0 )
goto vert_2;
// ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
// one of them to point at the previously unused rows[4].
sumsq5_ptrs[3 ] = sumsq5_rows[4 ];
sum5_ptrs[3 ] = sum5_rows[4 ];
} else {
sumsq5_ptrs[0 ] = sumsq5_rows[0 ];
sumsq5_ptrs[1 ] = sumsq5_rows[0 ];
sumsq5_ptrs[2 ] = sumsq5_rows[0 ];
sumsq5_ptrs[3 ] = sumsq5_rows[0 ];
sumsq5_ptrs[4 ] = sumsq5_rows[0 ];
sum5_ptrs[0 ] = sum5_rows[0 ];
sum5_ptrs[1 ] = sum5_rows[0 ];
sum5_ptrs[2 ] = sum5_rows[0 ];
sum5_ptrs[3 ] = sum5_rows[0 ];
sum5_ptrs[4 ] = sum5_rows[0 ];
sumsq3_ptrs[0 ] = sumsq3_rows[0 ];
sumsq3_ptrs[1 ] = sumsq3_rows[0 ];
sumsq3_ptrs[2 ] = sumsq3_rows[0 ];
sum3_ptrs[0 ] = sum3_rows[0 ];
sum3_ptrs[1 ] = sum3_rows[0 ];
sum3_ptrs[2 ] = sum3_rows[0 ];
sgr_box35_row_h(sumsq3_rows[0 ], sum3_rows[0 ],
sumsq5_rows[0 ], sum5_rows[0 ],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3 ], B3_ptrs[3 ],
w, params->sgr.s1, BITDEPTH_MAX);
rotate(A3_ptrs, B3_ptrs, 4 );
if (--h <= 0 )
goto vert_1;
sumsq5_ptrs[4 ] = sumsq5_rows[1 ];
sum5_ptrs[4 ] = sum5_rows[1 ];
sumsq3_ptrs[2 ] = sumsq3_rows[1 ];
sum3_ptrs[2 ] = sum3_rows[1 ];
sgr_box35_row_h(sumsq3_rows[1 ], sum3_rows[1 ],
sumsq5_rows[1 ], sum5_rows[1 ],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1 ], B5_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
rotate(A5_ptrs, B5_ptrs, 2 );
sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3 ], B3_ptrs[3 ],
w, params->sgr.s1, BITDEPTH_MAX);
rotate(A3_ptrs, B3_ptrs, 4 );
if (--h <= 0 )
goto vert_2;
sumsq5_ptrs[3 ] = sumsq5_rows[2 ];
sumsq5_ptrs[4 ] = sumsq5_rows[3 ];
sum5_ptrs[3 ] = sum5_rows[2 ];
sum5_ptrs[4 ] = sum5_rows[3 ];
sumsq3_ptrs[2 ] = sumsq3_rows[2 ];
sum3_ptrs[2 ] = sum3_rows[2 ];
sgr_box35_row_h(sumsq3_rows[2 ], sum3_rows[2 ],
sumsq5_rows[2 ], sum5_rows[2 ],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3 ], B3_ptrs[3 ],
w, params->sgr.s1, BITDEPTH_MAX);
rotate(A3_ptrs, B3_ptrs, 4 );
if (--h <= 0 )
goto odd;
sgr_box35_row_h(sumsq3_ptrs[2 ], sum3_ptrs[2 ],
sumsq5_rows[3 ], sum5_rows[3 ],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1 ], B5_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3 ], B3_ptrs[3 ],
w, params->sgr.s1, BITDEPTH_MAX);
sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
w, 2 , params->sgr.w0, params->sgr.w1
HIGHBD_TAIL_SUFFIX);
if (--h <= 0 )
goto vert_2;
// ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
// one of them to point at the previously unused rows[4].
sumsq5_ptrs[3 ] = sumsq5_rows[4 ];
sum5_ptrs[3 ] = sum5_rows[4 ];
}
do {
sgr_box35_row_h(sumsq3_ptrs[2 ], sum3_ptrs[2 ],
sumsq5_ptrs[3 ], sum5_ptrs[3 ],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3 ], B3_ptrs[3 ],
w, params->sgr.s1, BITDEPTH_MAX);
rotate(A3_ptrs, B3_ptrs, 4 );
if (--h <= 0 )
goto odd;
sgr_box35_row_h(sumsq3_ptrs[2 ], sum3_ptrs[2 ],
sumsq5_ptrs[4 ], sum5_ptrs[4 ],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1 ], B5_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3 ], B3_ptrs[3 ],
w, params->sgr.s1, BITDEPTH_MAX);
sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
w, 2 , params->sgr.w0, params->sgr.w1
HIGHBD_TAIL_SUFFIX);
} while (--h > 0 );
if (!(edges & LR_HAVE_BOTTOM))
goto vert_2;
sgr_box35_row_h(sumsq3_ptrs[2 ], sum3_ptrs[2 ],
sumsq5_ptrs[3 ], sum5_ptrs[3 ],
NULL, lpf_bottom, w, edges);
lpf_bottom += PXSTRIDE(stride);
sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3 ], B3_ptrs[3 ],
w, params->sgr.s1, BITDEPTH_MAX);
rotate(A3_ptrs, B3_ptrs, 4 );
sgr_box35_row_h(sumsq3_ptrs[2 ], sum3_ptrs[2 ],
sumsq5_ptrs[4 ], sum5_ptrs[4 ],
NULL, lpf_bottom, w, edges);
output_2:
sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1 ], B5_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3 ], B3_ptrs[3 ],
w, params->sgr.s1, BITDEPTH_MAX);
sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
w, 2 , params->sgr.w0, params->sgr.w1
HIGHBD_TAIL_SUFFIX);
return ;
vert_2:
// Duplicate the last row twice more
sumsq5_ptrs[3 ] = sumsq5_ptrs[2 ];
sumsq5_ptrs[4 ] = sumsq5_ptrs[2 ];
sum5_ptrs[3 ] = sum5_ptrs[2 ];
sum5_ptrs[4 ] = sum5_ptrs[2 ];
sumsq3_ptrs[2 ] = sumsq3_ptrs[1 ];
sum3_ptrs[2 ] = sum3_ptrs[1 ];
sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3 ], B3_ptrs[3 ],
w, params->sgr.s1, BITDEPTH_MAX);
rotate(A3_ptrs, B3_ptrs, 4 );
sumsq3_ptrs[2 ] = sumsq3_ptrs[1 ];
sum3_ptrs[2 ] = sum3_ptrs[1 ];
goto output_2;
odd:
// Copy the last row as padding once
sumsq5_ptrs[4 ] = sumsq5_ptrs[3 ];
sum5_ptrs[4 ] = sum5_ptrs[3 ];
sumsq3_ptrs[2 ] = sumsq3_ptrs[1 ];
sum3_ptrs[2 ] = sum3_ptrs[1 ];
sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1 ], B5_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3 ], B3_ptrs[3 ],
w, params->sgr.s1, BITDEPTH_MAX);
sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
w, 2 , params->sgr.w0, params->sgr.w1
HIGHBD_TAIL_SUFFIX);
output_1:
// Duplicate the last row twice more
sumsq5_ptrs[3 ] = sumsq5_ptrs[2 ];
sumsq5_ptrs[4 ] = sumsq5_ptrs[2 ];
sum5_ptrs[3 ] = sum5_ptrs[2 ];
sum5_ptrs[4 ] = sum5_ptrs[2 ];
sumsq3_ptrs[2 ] = sumsq3_ptrs[1 ];
sum3_ptrs[2 ] = sum3_ptrs[1 ];
sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1 ], B5_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3 ], B3_ptrs[3 ],
w, params->sgr.s1, BITDEPTH_MAX);
rotate(A3_ptrs, B3_ptrs, 4 );
// Output only one row
sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
w, 1 , params->sgr.w0, params->sgr.w1
HIGHBD_TAIL_SUFFIX);
return ;
vert_1:
// Copy the last row as padding once
sumsq5_ptrs[4 ] = sumsq5_ptrs[3 ];
sum5_ptrs[4 ] = sum5_ptrs[3 ];
sumsq3_ptrs[2 ] = sumsq3_ptrs[1 ];
sum3_ptrs[2 ] = sum3_ptrs[1 ];
sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1 ], B5_ptrs[1 ],
w, params->sgr.s0, BITDEPTH_MAX);
rotate(A5_ptrs, B5_ptrs, 2 );
sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3 ], B3_ptrs[3 ],
w, params->sgr.s1, BITDEPTH_MAX);
rotate(A3_ptrs, B3_ptrs, 4 );
goto output_1;
}
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/looprestoration.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/looprestoration.h"
#elif ARCH_PPC64LE
#include "src/ppc/looprestoration.h"
#elif ARCH_X86
#include "src/x86/looprestoration.h"
#endif
#endif
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c,
const int bpc)
{
c->wiener[0 ] = c->wiener[1 ] = wiener_c;
c->sgr[0 ] = sgr_5x5_c;
c->sgr[1 ] = sgr_3x3_c;
c->sgr[2 ] = sgr_mix_c;
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
loop_restoration_dsp_init_arm(c, bpc);
#elif ARCH_LOONGARCH64
loop_restoration_dsp_init_loongarch(c, bpc);
#elif ARCH_PPC64LE
loop_restoration_dsp_init_ppc(c, bpc);
#elif ARCH_X86
loop_restoration_dsp_init_x86(c, bpc);
#endif
#endif
}
Messung V0.5 in Prozent C=91 H=78 G=84
¤ Dauer der Verarbeitung: 0.25 Sekunden
(vorverarbeitet am 2026-06-07)
¤
*© Formatika GbR, Deutschland