/*
* Copyright © 2024, VideoLAN and dav1d authors
* Copyright © 2024, Luca Barbato
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "common/attributes.h"
#include "src/ppc/mc.h"
#include "src/tables.h"
#include "src/ppc/dav1d_types.h"
#if BITDEPTH == 8
/*
 * Scalar blend of one pixel: weights `a` by (64 - m) and `b` by m, adds 32
 * for rounding and shifts the sum right by 6.
 *
 * Every argument is parenthesized so that expression arguments (for example
 * `x + 1`) keep their intended grouping. `m` is expanded twice, so it must
 * not have side effects.
 */
#define blend_px(a, b, m) ((((a) * (64 - (m)) + (b) * (m)) + 32) >> 6)
// Common signature of the per-width kernels stored in this file's dispatch
// tables (blend_funcs, blend_v_funcs, blend_h_funcs). One call processes one
// group of rows (4, 2 or 3 respectively, matching the step of the driver
// loops). `stride` is the distance between destination rows; the drivers
// pass PXSTRIDE(dst_stride).
typedef void (*blend_line)(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride);
/*
 * Blend four rows of byte-interleaved operands into 16-bit results.
 *
 * abN holds two pixel sources interleaved byte by byte and nm_mN the
 * matching interleaved weights (callers in this file build both with
 * vec_mergeh/vec_mergel). For every row the even-lane and odd-lane byte
 * products are summed, 32 is added for rounding and the sum is shifted
 * right by 6; the results are assigned to d0_u16..d3_u16.
 *
 * Expands to a bare { } block, so call sites may or may not add a trailing
 * semicolon. Arguments are expanded more than once and are expected to be
 * plain variables.
 */
#define BLEND_LINES4(d0_u16, d1_u16, d2_u16, d3_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3) \
{ \
    const u16x8 blend_rnd = vec_splats((uint16_t)32); \
    const u16x8 blend_sh = vec_splat_u16(6); \
    \
    d0_u16 = vec_add(vec_mule(ab0, nm_m0), vec_mulo(ab0, nm_m0)); \
    d1_u16 = vec_add(vec_mule(ab1, nm_m1), vec_mulo(ab1, nm_m1)); \
    d2_u16 = vec_add(vec_mule(ab2, nm_m2), vec_mulo(ab2, nm_m2)); \
    d3_u16 = vec_add(vec_mule(ab3, nm_m3), vec_mulo(ab3, nm_m3)); \
    \
    d0_u16 = vec_sr(vec_add(d0_u16, blend_rnd), blend_sh); \
    d1_u16 = vec_sr(vec_add(d1_u16, blend_rnd), blend_sh); \
    d2_u16 = vec_sr(vec_add(d2_u16, blend_rnd), blend_sh); \
    d3_u16 = vec_sr(vec_add(d3_u16, blend_rnd), blend_sh); \
}
/*
 * Blend three rows of byte-interleaved operands into 16-bit results.
 *
 * abN holds two pixel sources interleaved byte by byte and nm_mN the
 * matching interleaved weights. For every row the even-lane and odd-lane
 * byte products are summed, 32 is added for rounding and the sum is shifted
 * right by 6; the results are assigned to d0_u16..d2_u16.
 *
 * The last parameter is spelled nm_m2 to match the name used in the body,
 * so the macro operates on its own ninth argument instead of relying on a
 * variable called nm_m2 being in scope at the call site.
 *
 * Expands to a bare { } block, so call sites may or may not add a trailing
 * semicolon. Arguments are expanded more than once and are expected to be
 * plain variables.
 */
#define BLEND_LINES3(d0_u16, d1_u16, d2_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2) \
{ \
    u16x8 anm0 = vec_mule(ab0, nm_m0); \
    u16x8 anm1 = vec_mule(ab1, nm_m1); \
    u16x8 anm2 = vec_mule(ab2, nm_m2); \
    \
    u16x8 bm0 = vec_mulo(ab0, nm_m0); \
    u16x8 bm1 = vec_mulo(ab1, nm_m1); \
    u16x8 bm2 = vec_mulo(ab2, nm_m2); \
    \
    d0_u16 = vec_add(anm0, bm0); \
    d1_u16 = vec_add(anm1, bm1); \
    d2_u16 = vec_add(anm2, bm2); \
    \
    d0_u16 = vec_add(d0_u16, vec_splats((uint16_t)32)); \
    d1_u16 = vec_add(d1_u16, vec_splats((uint16_t)32)); \
    d2_u16 = vec_add(d2_u16, vec_splats((uint16_t)32)); \
    \
    d0_u16 = vec_sr(d0_u16, vec_splat_u16(6)); \
    d1_u16 = vec_sr(d1_u16, vec_splat_u16(6)); \
    d2_u16 = vec_sr(d2_u16, vec_splat_u16(6)); \
}
/*
 * Blend two rows of byte-interleaved operands into 16-bit results.
 *
 * abN holds two pixel sources interleaved byte by byte and nm_mN the
 * matching interleaved weights. For every row the even-lane and odd-lane
 * byte products are summed, 32 is added for rounding and the sum is shifted
 * right by 6; the results are assigned to d0_u16 and d1_u16.
 *
 * Expands to a bare { } block, so call sites may or may not add a trailing
 * semicolon. Arguments are expanded more than once and are expected to be
 * plain variables.
 */
#define BLEND_LINES2(d0_u16, d1_u16, ab0, ab1, nm_m0, nm_m1) \
{ \
    const u16x8 blend_rnd = vec_splats((uint16_t)32); \
    const u16x8 blend_sh = vec_splat_u16(6); \
    \
    d0_u16 = vec_add(vec_mule(ab0, nm_m0), vec_mulo(ab0, nm_m0)); \
    d1_u16 = vec_add(vec_mule(ab1, nm_m1), vec_mulo(ab1, nm_m1)); \
    \
    d0_u16 = vec_sr(vec_add(d0_u16, blend_rnd), blend_sh); \
    d1_u16 = vec_sr(vec_add(d1_u16, blend_rnd), blend_sh); \
}
static void blend4(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
u8x16 v64u8 = vec_splats((uint8_t)64 );
u8x16 a0 = vec_xl(0 , dst);
u8x16 a1 = vec_xl(0 , dst + stride);
u8x16 a2 = vec_xl(0 , dst + 2 * stride);
u8x16 a3 = vec_xl(0 , dst + 3 * stride);
u8x16 m0 = vec_xl(0 , mask);
u8x16 m1 = vec_xl(0 , mask + 4 );
u8x16 m2 = vec_xl(0 , mask + 2 * 4 );
u8x16 m3 = vec_xl(0 , mask + 3 * 4 );
u8x16 b0 = vec_xl(0 , tmp);
u8x16 b1 = vec_xl(0 , tmp + 4 );
u8x16 b2 = vec_xl(0 , tmp + 2 * 4 );
u8x16 b3 = vec_xl(0 , tmp + 3 * 4 );
u8x16 nm0 = vec_sub(v64u8, m0);
u8x16 nm1 = vec_sub(v64u8, m1);
u8x16 nm2 = vec_sub(v64u8, m2);
u8x16 nm3 = vec_sub(v64u8, m3);
u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd
u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd
u8x16 ab2 = vec_mergeh(a2, b2); // a even, b odd
u8x16 ab3 = vec_mergeh(a3, b3); // a even, b odd
u8x16 nm_m0 = vec_mergeh(nm0, m0);
u8x16 nm_m1 = vec_mergeh(nm1, m1);
u8x16 nm_m2 = vec_mergeh(nm2, m2);
u8x16 nm_m3 = vec_mergeh(nm3, m3);
u16x8 d0_u16, d1_u16, d2_u16, d3_u16;
BLEND_LINES4(d0_u16, d1_u16, d2_u16, d3_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3);
u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16);
u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16);
u8x16 d2 = (u8x16)vec_pack(d2_u16, d2_u16);
u8x16 d3 = (u8x16)vec_pack(d3_u16, d3_u16);
vec_xst_len(d0, dst, 4 );
vec_xst_len(d1, dst + stride, 4 );
vec_xst_len(d2, dst + 2 * stride, 4 );
vec_xst_len(d3, dst + 3 * stride, 4 );
}
static void blend8(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
u8x16 v64u8 = vec_splats((uint8_t)64 );
u8x16 a0 = vec_xl(0 , dst);
u8x16 a1 = vec_xl(0 , dst + stride);
u8x16 a2 = vec_xl(0 , dst + 2 * stride);
u8x16 a3 = vec_xl(0 , dst + 3 * stride);
u8x16 m0 = vec_xl(0 , mask);
u8x16 m1 = vec_xl(0 , mask + 8 );
u8x16 m2 = vec_xl(0 , mask + 2 * 8 );
u8x16 m3 = vec_xl(0 , mask + 3 * 8 );
u8x16 b0 = vec_xl(0 , tmp);
u8x16 b1 = vec_xl(0 , tmp + 8 );
u8x16 b2 = vec_xl(0 , tmp + 2 * 8 );
u8x16 b3 = vec_xl(0 , tmp + 3 * 8 );
u8x16 nm0 = vec_sub(v64u8, m0);
u8x16 nm1 = vec_sub(v64u8, m1);
u8x16 nm2 = vec_sub(v64u8, m2);
u8x16 nm3 = vec_sub(v64u8, m3);
u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd
u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd
u8x16 ab2 = vec_mergeh(a2, b2); // a even, b odd
u8x16 ab3 = vec_mergeh(a3, b3); // a even, b odd
u8x16 nm_m0 = vec_mergeh(nm0, m0);
u8x16 nm_m1 = vec_mergeh(nm1, m1);
u8x16 nm_m2 = vec_mergeh(nm2, m2);
u8x16 nm_m3 = vec_mergeh(nm3, m3);
u16x8 d0_u16, d1_u16, d2_u16, d3_u16;
BLEND_LINES4(d0_u16, d1_u16, d2_u16, d3_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3);
u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16);
u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16);
u8x16 d2 = (u8x16)vec_pack(d2_u16, d2_u16);
u8x16 d3 = (u8x16)vec_pack(d3_u16, d3_u16);
vec_xst_len(d0, dst, 8 );
vec_xst_len(d1, dst + stride, 8 );
vec_xst_len(d2, dst + 2 * stride, 8 );
vec_xst_len(d3, dst + 3 * stride, 8 );
}
static inline void blend16_lines(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride)
{
u8x16 v64u8 = vec_splats((uint8_t)64 );
u8x16 a0 = vec_xl(0 , dst);
u8x16 a1 = vec_xl(0 , dst + stride);
u8x16 a2 = vec_xl(0 , dst + 2 * stride);
u8x16 a3 = vec_xl(0 , dst + 3 * stride);
u8x16 m0 = vec_xl(0 , mask);
u8x16 m1 = vec_xl(0 , mask + mstride);
u8x16 m2 = vec_xl(0 , mask + 2 * mstride);
u8x16 m3 = vec_xl(0 , mask + 3 * mstride);
u8x16 b0 = vec_xl(0 , tmp);
u8x16 b1 = vec_xl(0 , tmp + mstride);
u8x16 b2 = vec_xl(0 , tmp + 2 * mstride);
u8x16 b3 = vec_xl(0 , tmp + 3 * mstride);
u8x16 nm0 = vec_sub(v64u8, m0);
u8x16 nm1 = vec_sub(v64u8, m1);
u8x16 nm2 = vec_sub(v64u8, m2);
u8x16 nm3 = vec_sub(v64u8, m3);
u8x16 ab0 = vec_mergeh(a0, b0);
u8x16 ab1 = vec_mergeh(a1, b1);
u8x16 ab2 = vec_mergeh(a2, b2);
u8x16 ab3 = vec_mergeh(a3, b3);
u8x16 nm_m0 = vec_mergeh(nm0, m0);
u8x16 nm_m1 = vec_mergeh(nm1, m1);
u8x16 nm_m2 = vec_mergeh(nm2, m2);
u8x16 nm_m3 = vec_mergeh(nm3, m3);
u16x8 d0h_u16, d1h_u16, d2h_u16, d3h_u16;
u16x8 d0l_u16, d1l_u16, d2l_u16, d3l_u16;
BLEND_LINES4(d0h_u16, d1h_u16, d2h_u16, d3h_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3)
ab0 = vec_mergel(a0, b0);
ab1 = vec_mergel(a1, b1);
ab2 = vec_mergel(a2, b2);
ab3 = vec_mergel(a3, b3);
nm_m0 = vec_mergel(nm0, m0);
nm_m1 = vec_mergel(nm1, m1);
nm_m2 = vec_mergel(nm2, m2);
nm_m3 = vec_mergel(nm3, m3);
BLEND_LINES4(d0l_u16, d1l_u16, d2l_u16, d3l_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3)
u8x16 d0 = (u8x16)vec_pack(d0h_u16, d0l_u16);
u8x16 d1 = (u8x16)vec_pack(d1h_u16, d1l_u16);
u8x16 d2 = (u8x16)vec_pack(d2h_u16, d2l_u16);
u8x16 d3 = (u8x16)vec_pack(d3h_u16, d3l_u16);
vec_xst(d0, 0 ,dst);
vec_xst(d1, 0 ,dst + stride);
vec_xst(d2, 0 ,dst + 2 * stride);
vec_xst(d3, 0 ,dst + 3 * stride);
}
// Blends four rows of 16 pixels: a single blend16_lines() call with tmp and
// mask rows 16 bytes apart.
static void blend16(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
blend16_lines(dst, tmp, mask, stride, 16 );
}
/*
 * Blends four rows of 32 pixels as two adjacent 16-pixel columns; tmp and
 * mask rows are 32 bytes apart.
 */
static void blend32(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    blend16_lines(dst, tmp, mask, stride, 32);
    blend16_lines(dst + 16, tmp + 16, mask + 16, stride, 32);
}
/*
 * Kernels for dav1d_blend_8bpc_pwr9(), indexed by ctz(w) - 2
 * (w = 4, 8, 16, 32). The table is only ever read, so it is const.
 */
static const blend_line blend_funcs[4] = {
    blend4, blend8, blend16, blend32
};
/*
 * Blends `tmp` into `dst` with a per-pixel mask, four rows per kernel call.
 *
 * dst        destination picture, rows PXSTRIDE(dst_stride) apart
 * tmp, mask  packed buffers whose rows are `w` bytes apart
 * w          block width; selects the kernel via ctz(w) - 2 and is asserted
 *            to be at most 32
 * h          number of rows; consumed in steps of 4
 */
void dav1d_blend_8bpc_pwr9(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                           const int w, int h, const uint8_t *mask)
{
    assert(w <= 32);

    const blend_line kernel = blend_funcs[ctz(w) - 2];

    for (; h > 0; h -= 4) {
        kernel(dst, tmp, mask, PXSTRIDE(dst_stride));

        // Step all three buffers to the next group of four rows.
        dst += 4 * PXSTRIDE(dst_stride);
        tmp += 4 * w;
        mask += 4 * w;
    }
}
static inline void blend_v_h(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride, int l)
{
u8x16 v64u8 = vec_splats((uint8_t)64 );
u8x16 a0 = vec_xl(0 , dst);
u8x16 a1 = vec_xl(0 , dst + stride);
u8x16 m0 = vec_xl(0 , mask);
u8x16 b0 = vec_xl(0 , tmp);
u8x16 b1 = vec_xl(0 , tmp + mstride);
u8x16 nm0 = vec_sub(v64u8, m0);
u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd
u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd
u8x16 nm_m0 = vec_mergeh(nm0, m0);
u16x8 d0_u16, d1_u16;
BLEND_LINES2(d0_u16, d1_u16, ab0, ab1, nm_m0, nm_m0);
u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16);
u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16);
vec_xst_len(d0, dst, l);
vec_xst_len(d1, dst + stride, l);
}
static inline void blend_v_hl(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride, int l)
{
u8x16 v64u8 = vec_splats((uint8_t)64 );
u8x16 a0 = vec_xl(0 , dst);
u8x16 a1 = vec_xl(0 , dst + stride);
u8x16 m0 = vec_xl(0 , mask);
u8x16 b0 = vec_xl(0 , tmp);
u8x16 b1 = vec_xl(0 , tmp + mstride);
u8x16 nm0 = vec_sub(v64u8, m0);
u8x16 ab0 = vec_mergeh(a0, b0);
u8x16 ab1 = vec_mergeh(a1, b1);
u8x16 nm_m0 = vec_mergeh(nm0, m0);
u16x8 d0h_u16, d1h_u16;
u16x8 d0l_u16, d1l_u16;
BLEND_LINES2(d0h_u16, d1h_u16, ab0, ab1, nm_m0, nm_m0)
ab0 = vec_mergel(a0, b0);
ab1 = vec_mergel(a1, b1);
nm_m0 = vec_mergel(nm0, m0);
BLEND_LINES2(d0l_u16, d1l_u16, ab0, ab1,nm_m0, nm_m0)
u8x16 d0 = (u8x16)vec_pack(d0h_u16, d0l_u16);
u8x16 d1 = (u8x16)vec_pack(d1h_u16, d1l_u16);
vec_xst_len(d0, dst, l);
vec_xst_len(d1, dst + stride, l);
}
// Two-row kernel at index 1 of blend_v_funcs (w == 4 in the driver): tmp rows
// are 4 bytes apart and only the first 3 bytes of each row are written.
static void blend_v3(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
blend_v_h(dst, tmp, mask, stride, 4 , 3 );
}
// Two-row kernel at index 2 of blend_v_funcs (w == 8 in the driver): tmp rows
// are 8 bytes apart and only the first 6 bytes of each row are written.
static void blend_v6(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
blend_v_h(dst, tmp, mask, stride, 8 , 6 );
}
// Two-row kernel at index 3 of blend_v_funcs (w == 16 in the driver): tmp
// rows are 16 bytes apart and only the first 12 bytes of each row are written.
static void blend_v12(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
blend_v_hl(dst, tmp, mask, stride, 16 , 12 );
}
// Two-row kernel at index 4 of blend_v_funcs (w == 32 in the driver): tmp
// rows are 32 bytes apart and 24 bytes of each row are written, in two parts.
static void blend_v24(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
// Bytes 0..15 of each row, blended as two vector halves.
blend_v_hl(dst, tmp, mask, stride, 32 , 16 );
// Bytes 16..23 of each row, using the mask entries starting at offset 16.
blend_v_h(dst + 16 , tmp + 16 , mask + 16 , stride, 32 , 8 );
}
/*
 * Scalar two-row kernel at index 0 of blend_v_funcs (w == 2 in the driver).
 * Only the first pixel of each row is blended, both with mask[0]; the second
 * row of tmp starts at tmp[2].
 */
static void blend_v1(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    uint8_t *const top = &dst[0];
    uint8_t *const bottom = &dst[stride];

    *top = blend_px(top[0], tmp[0], mask[0]);
    *bottom = blend_px(bottom[0], tmp[2], mask[0]);
}
/*
 * Kernels for dav1d_blend_v_8bpc_pwr9(), indexed by ctz(w) - 1
 * (w = 2, 4, 8, 16, 32). The table is only ever read, so it is const.
 */
static const blend_line blend_v_funcs[5] = {
    blend_v1, blend_v3, blend_v6, blend_v12, blend_v24
};
/*
 * Blends `tmp` into `dst` with per-column weights, two rows per kernel call.
 *
 * The weights start at dav1d_obmc_masks[w] and are reused unchanged for
 * every pair of rows. dst rows are PXSTRIDE(dst_stride) apart and tmp rows
 * are `w` bytes apart. `w` selects the kernel via ctz(w) - 1 and is asserted
 * to be at most 32; `h` is consumed in steps of 2.
 */
void dav1d_blend_v_8bpc_pwr9(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                             const int w, int h)
{
    const uint8_t *const mask = &dav1d_obmc_masks[w];

    assert(w <= 32);

    const blend_line kernel = blend_v_funcs[ctz(w) - 1];

    for (; h > 0; h -= 2) {
        kernel(dst, tmp, mask, PXSTRIDE(dst_stride));

        // Step dst and tmp to the next pair of rows; the mask stays put.
        dst += 2 * PXSTRIDE(dst_stride);
        tmp += 2 * w;
    }
}
static inline void blend_h_h(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride, int l)
{
u8x16 v64u8 = vec_splats((uint8_t)64 );
u8x16 a0 = vec_xl(0 , dst);
u8x16 a1 = vec_xl(0 , dst + stride);
u8x16 a2 = vec_xl(0 , dst + 2 * stride);
u8x16 m = vec_xl(0 , mask);
u8x16 b0 = vec_xl(0 , tmp);
u8x16 b1 = vec_xl(0 , tmp + mstride);
u8x16 b2 = vec_xl(0 , tmp + 2 * mstride);
u8x16 m0 = vec_splat(m, 0 );
u8x16 m1 = vec_splat(m, 1 );
u8x16 m2 = vec_splat(m, 2 );
u8x16 nm0 = vec_sub(v64u8, m0);
u8x16 nm1 = vec_sub(v64u8, m1);
u8x16 nm2 = vec_sub(v64u8, m2);
u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd
u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd
u8x16 ab2 = vec_mergeh(a2, b2); // a even, b odd
u8x16 nm_m0 = vec_mergeh(nm0, m0);
u8x16 nm_m1 = vec_mergeh(nm1, m1);
u8x16 nm_m2 = vec_mergeh(nm2, m2);
u16x8 d0_u16, d1_u16, d2_u16;
BLEND_LINES3(d0_u16, d1_u16, d2_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2);
u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16);
u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16);
u8x16 d2 = (u8x16)vec_pack(d2_u16, d2_u16);
vec_xst_len(d0, dst, l);
vec_xst_len(d1, dst + stride, l);
vec_xst_len(d2, dst + 2 * stride, l);
}
static inline void blend_h_hl(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride)
{
u8x16 v64u8 = vec_splats((uint8_t)64 );
u8x16 a0 = vec_xl(0 , dst);
u8x16 a1 = vec_xl(0 , dst + stride);
u8x16 a2 = vec_xl(0 , dst + 2 * stride);
u8x16 m = vec_xl(0 , mask);
u8x16 b0 = vec_xl(0 , tmp);
u8x16 b1 = vec_xl(0 , tmp + mstride);
u8x16 b2 = vec_xl(0 , tmp + 2 * mstride);
u8x16 m0 = vec_splat(m, 0 );
u8x16 m1 = vec_splat(m, 1 );
u8x16 m2 = vec_splat(m, 2 );
u8x16 nm0 = vec_sub(v64u8, m0);
u8x16 nm1 = vec_sub(v64u8, m1);
u8x16 nm2 = vec_sub(v64u8, m2);
u8x16 ab0 = vec_mergeh(a0, b0);
u8x16 ab1 = vec_mergeh(a1, b1);
u8x16 ab2 = vec_mergeh(a2, b2);
u8x16 nm_m0 = vec_mergeh(nm0, m0);
u8x16 nm_m1 = vec_mergeh(nm1, m1);
u8x16 nm_m2 = vec_mergeh(nm2, m2);
u16x8 d0h_u16, d1h_u16, d2h_u16;
u16x8 d0l_u16, d1l_u16, d2l_u16;
BLEND_LINES3(d0h_u16, d1h_u16, d2h_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2)
ab0 = vec_mergel(a0, b0);
ab1 = vec_mergel(a1, b1);
ab2 = vec_mergel(a2, b2);
nm_m0 = vec_mergel(nm0, m0);
nm_m1 = vec_mergel(nm1, m1);
nm_m2 = vec_mergel(nm2, m2);
BLEND_LINES3(d0l_u16, d1l_u16, d2l_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2)
u8x16 d0 = (u8x16)vec_pack(d0h_u16, d0l_u16);
u8x16 d1 = (u8x16)vec_pack(d1h_u16, d1l_u16);
u8x16 d2 = (u8x16)vec_pack(d2h_u16, d2l_u16);
vec_xst(d0, 0 , dst);
vec_xst(d1, 0 ,dst + stride);
vec_xst(d2, 0 ,dst + 2 * stride);
}
/*
 * Scalar three-row kernel at index 0 of blend_h_funcs (w == 2 in the
 * driver). Row r uses the single weight mask[r]; tmp rows are 2 bytes apart.
 */
static void blend_h2(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    for (int row = 0; row < 3; row++, dst += stride, tmp += 2) {
        const int weight = mask[row];

        dst[0] = blend_px(dst[0], tmp[0], weight);
        dst[1] = blend_px(dst[1], tmp[1], weight);
    }
}
// Three-row kernel for 4-pixel rows: tmp rows are 4 bytes apart and 4 bytes
// of each row are written.
static void blend_h4(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
blend_h_h(dst, tmp, mask, stride, 4 , 4 );
}
// Three-row kernel for 8-pixel rows: tmp rows are 8 bytes apart and 8 bytes
// of each row are written.
static void blend_h8(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
blend_h_h(dst, tmp, mask, stride, 8 , 8 );
}
// Three-row kernel for 16-pixel rows: one full-vector blend_h_hl() call with
// tmp rows 16 bytes apart.
static void blend_h16(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
blend_h_hl(dst, tmp, mask, stride, 16 );
}
/*
 * Three-row kernel for 32-pixel rows: two 16-pixel columns sharing the same
 * three weights, with tmp rows 32 bytes apart.
 */
static void blend_h32(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    for (int x = 0; x < 32; x += 16) {
        blend_h_hl(dst + x, tmp + x, mask, stride, 32);
    }
}
/*
 * Three-row kernel for 64-pixel rows: four 16-pixel columns sharing the same
 * three weights, with tmp rows 64 bytes apart.
 */
static void blend_h64(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    for (int x = 0; x < 64; x += 16) {
        blend_h_hl(dst + x, tmp + x, mask, stride, 64);
    }
}
/*
 * Three-row kernel for 128-pixel rows: eight 16-pixel columns, processed
 * left to right, sharing the same three weights, with tmp rows 128 bytes
 * apart.
 */
static void blend_h128(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    for (int x = 0; x < 128; x += 16) {
        blend_h_hl(dst + x, tmp + x, mask, stride, 128);
    }
}
/*
 * Kernels for dav1d_blend_h_8bpc_pwr9(), indexed by ctz(w) - 1
 * (w = 2, 4, 8, 16, 32, 64, 128). The table is only ever read, so it is
 * const.
 */
static const blend_line blend_h_funcs[7] = {
    blend_h2, blend_h4, blend_h8, blend_h16, blend_h32, blend_h64, blend_h128
};
/*
 * Blends `tmp` into `dst` with one weight per row, three rows per kernel
 * call.
 *
 * The weights start at dav1d_obmc_masks[h]. Only the first (h * 3) >> 2 rows
 * are blended. dst rows are PXSTRIDE(dst_stride) apart and tmp rows are `w`
 * bytes apart. `w` selects the kernel via ctz(w) - 1 and is asserted to be
 * at most 128.
 */
void dav1d_blend_h_8bpc_pwr9(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                             const int w, int h)
{
    const uint8_t *mask = &dav1d_obmc_masks[h];

    h = (h * 3) >> 2;

    assert(w <= 128);

    const blend_line kernel = blend_h_funcs[ctz(w) - 1];

    if (h == 1) {
        // A single row cannot go through the three-row kernels: do it in
        // scalar code with the first weight.
        const int m = mask[0];

        for (int x = 0; x < w; x++) {
            dst[x] = blend_px(dst[x], tmp[x], m);
        }
        return;
    }

    for (int rows_left = h; rows_left > 0; rows_left -= 3) {
        kernel(dst, tmp, mask, PXSTRIDE(dst_stride));

        // Step to the next three rows and their three weights.
        dst += 3 * PXSTRIDE(dst_stride);
        tmp += 3 * w;
        mask += 3;
    }
}
#endif // BITDEPTH
Messung V0.5 in Prozent C=94 H=92 G=92
¤ Dauer der Verarbeitung: 0.13 Sekunden
(vorverarbeitet am 2026-06-05)
¤
*© Formatika GbR, Deutschland