; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"
%if ARCH_X86_64
SECTION_RODATA 64

; Byte-permute index vectors (64 bytes each, loaded with aligned 64-byte
; loads). The chroma-subsampled paths use them with vpermi2b/vpermt2b to pick
; the even / odd bytes out of a pair of luma vectors before averaging them.
pb_even:
%assign fg_tbl_idx 0
%rep 64
                db fg_tbl_idx           ; 0, 2, 4, ..., 126
%assign fg_tbl_idx fg_tbl_idx+2
%endrep
pb_odd:
%assign fg_tbl_idx 1
%rep 64
                db fg_tbl_idx           ; 1, 3, 5, ..., 127
%assign fg_tbl_idx fg_tbl_idx+2
%endrep

; pshufb control applied to the gathered scaling bytes: yields
; (byte[8+i], byte[i]) pairs within each 16-byte lane.
interleave_hl:  db  8,  0,  9,  1, 10,  2, 11,  3, 12,  4, 13,  5, 14,  6, 15,  7

; Horizontal-overlap weights, applied with pmaddubsw to (left, cur) byte pairs.
; The (0, 32) pairs pass the current grain through unchanged after the
; pmulhrsw by pw_1024 (a rounded shift right by 5).
pb_27_17_17_27: db 27, 17, 17, 27,  0, 32,  0, 32
pb_23_22_0_32:  db 23, 22,  0, 32,  0, 32,  0, 32

; NOTE: the order of the next five entries is load-bearing. One code path
; loads 32 bytes starting at pb_27_17 and keeps dword 0 of each 16-byte lane,
; so pb_17_27 has to sit exactly 16 bytes after pb_27_17.
pb_27_17:       db 27, 17, 27, 17
pb_23_22:       db 23, 22, 23, 22
pw_8:           dw 8, 8                 ; also used as a pshufb mask (bytes 8, 0)
pw_1024:        dw 1024, 1024
pb_17_27:       db 17, 27, 17, 27

; Output clamp limits, 4 identical bytes per entry so that a dword broadcast
; fills a vector. Indexed from the clip_to_restricted_range flag (and is_id
; for chroma) by the functions below.
fg_max:         db 255, 255, 255, 255
                db 240, 240, 240, 240
                db 235, 235, 235, 235
fg_min:         db   0,   0,   0,   0
                db  16,  16,  16,  16

; pmulhrsw multipliers, addressed as noise_rnd + scaling_shift*4 - 32
; (so the first entry corresponds to scaling_shift == 8).
noise_rnd:      dw 128, 128
                dw  64,  64
                dw  32,  32
                dw  16,  16

SECTION .text
INIT_ZMM avx512icl
;------------------------------------------------------------------------------
; fgy_32x32xn_8bpc(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
;
; Adds film grain to 8-bit luma. The picture is walked in 32-pixel-wide
; columns (wq advances by 32) and each call to the .add_noise* helper handles
; two rows (hb decreases by 2).
;
; Four column loops exist, selected by fg_data.overlap_flag and sby != 0:
;   .loop_x            no overlap
;   .loop_x_h_overlap  blend with the previous column's grain (left edge)
;   .v_overlap         blend with the grain of the block row above (first
;                      column only; falls through to .hv_overlap)
;   .hv_overlap        both of the above
;
; Vector registers set up here and kept for the whole function:
;   m0-m3  256-byte scaling table          m4   interleave_hl shuffle
;   m5     zero                            m6   noise_rnd[scaling_shift]
;   m7/m8  min/max clamp values            m9   pw_1024
;   m10    horizontal overlap weights      m12  vertical overlap weights
;   k1     mask selecting the 4 leading bytes (or qwords) of each 256-bit half
;
; dstq is turned into (dst - src) so that [dstq+srcq] addresses the output
; while only srcq has to be advanced.
;------------------------------------------------------------------------------
cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \
                                     grain_lut, h, sby, see, overlap
%define base r11-fg_min
    lea             r11, [fg_min]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             sbyd, sbym
    mov             overlapd, [fg_dataq+FGData.overlap_flag]
    mov             r12, 0x0000000f0000000f ; h_overlap mask
    mova            m0, [scalingq+64*0]
    mova            m1, [scalingq+64*1]
    mova            m2, [scalingq+64*2]
    mova            m3, [scalingq+64*3]
    kmovq           k1, r12
    vbroadcasti32x4 m4, [base+interleave_hl]
    vpbroadcastd    ym16, [base+pb_27_17]
    vpbroadcastd    m12, [base+pb_17_27]
    vpbroadcastd    m6, [base+noise_rnd+r6*4-32]
    test            sbyd, sbyd
    setnz           r6b
    vpbroadcastd    m7, [base+fg_min+r7*4]
    vpbroadcastd    m8, [base+fg_max+r7*8]
    pxor            m5, m5
    vpbroadcastd    m9, [base+pw_1024]
    vpbroadcastq    m10, [base+pb_27_17_17_27]
    ; m12: low 256 bits (first row) = 27,17 pairs, high 256 bits = 17,27 pairs
    vmovdqa64       m12{k1}, m16
    ; vertical overlap only when this is not the first block row AND the
    ; overlap flag is set
    test            r6b, overlapb
    jnz             .v_overlap

    ; derive the 16-bit per-row seed from sby and fg_data.seed
    imul            seed, sbyd, (173 << 24) | 37
    add             seed, (105 << 24) | 178
    rorx            seed, seed, 24
    movzx           seed, seew
    xor             seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, overlap

    lea             src_bakq, [srcq+wq]     ; end of row; wq counts up from -w
    neg             wq
    sub             dstq, srcq              ; dstq = dst - src

.loop_x:
    ; advance the seed: rotate right by one, new top bit from the parity test
    rorx            r6, seeq, 1
    or              seed, 0xeff4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d               ; updated seed
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164
    lea             offxd, [offyq+offxq*2+829] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, overlap

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
.loop_y:
    ; two grain rows of 32 bytes each (grain_lut row pitch is 82 bytes)
    movu            ym21, [grain_lutq+offxyq-82]
    vinserti32x8    m21, [grain_lutq+offxyq+0], 1
    call            .add_noise
    sub             hb, 2
    jg              .loop_y
    add             wq, 32
    jge             .end
    lea             srcq, [src_bakq+wq]
    test            overlapd, overlapd
    jz              .loop_x
    test            sbyd, sbyd
    jnz             .hv_overlap

.loop_x_h_overlap:
    rorx            r6, seeq, 1
    or              seed, 0xeff4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, left_offxy

    rorx            offyd, seed, 8
    mov             left_offxyd, offxd      ; previous column's offy*stride
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164
    lea             offxd, [offyq+offxq*2+829] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, left_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
.loop_y_h_overlap:
    movu            ym20, [grain_lutq+offxyq-82]
    vinserti32x8    m20, [grain_lutq+offxyq+0], 1
    ; 4 bytes per row from the previous column's grain block, 32 bytes to the
    ; right of where that column started reading
    movd            xm19, [grain_lutq+left_offxyq-50]
    vinserti32x4    m19, [grain_lutq+left_offxyq+32], 2
    punpcklbw       m19, m20                ; (left, cur) byte pairs
    pmaddubsw       m19, m10, m19
    pmulhrsw        m19, m9
    punpckhbw       m21, m20, m5
    packsswb        m20{k1}, m19, m19       ; replace the 4 leading bytes per row
    punpcklbw       m20, m5, m20
    call            .add_noise_h
    sub             hb, 2
    jg              .loop_y_h_overlap
    add             wq, 32
    jge             .end
    lea             srcq, [src_bakq+wq]
    test            sbyd, sbyd
    jnz             .hv_overlap
    jmp             .loop_x_h_overlap

.v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \
                h, sby, see, overlap

    ; compute the seeds of the current and the previous block row in parallel,
    ; packed into the two 16-bit halves of one register
    movzx           r6d, sbyb
    imul            seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, r6d, 173 * 0x00010001
    imul            r6d, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add             r6d, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and             r6d, 0xff00ff00
    xor             seed, r7d
    xor             seed, r6d               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, overlap

    lea             src_bakq, [srcq+wq]
    neg             wq
    sub             dstq, srcq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1            ; updated (cur_seed << 16) | top_seed
    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    ; (the extra 32*82 lands in the low half only, i.e. in top_offxy)
    lea             offxd, [offyq+offxq*2+0x10001*829+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, overlap, top_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
    movu            ym19, [grain_lutq+offxyq-82]
    vinserti32x8    m19, [grain_lutq+offxyq+0], 1
    movu            ym21, [grain_lutq+top_offxyq-82]
    vinserti32x8    m21, [grain_lutq+top_offxyq+0], 1
    punpckhbw       m20, m21, m19           ; (top, cur) byte pairs
    punpcklbw       m21, m19
    call            .add_noise_v
    ; only the first two rows are blended; the rest of the column goes
    ; through the plain row loop
    sub             hb, 2
    jg              .loop_y
    add             wq, 32
    jge             .end
    lea             srcq, [src_bakq+wq]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    ; to .v_overlap, and instead always fall-through to h+v overlap
.hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1            ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, left_offxy, top_offxy, topleft_offxy

    mov             topleft_offxyd, top_offxyd
    rorx            offyd, seed, 8
    mov             left_offxyd, offxd
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offxd, [offyq+offxq*2+0x10001*829+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, left_offxy, top_offxy, topleft_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
    movu            ym19, [grain_lutq+offxyq-82]
    vinserti32x8    m19, [grain_lutq+offxyq+0], 1
    movd            xm16, [grain_lutq+left_offxyq-50]
    vinserti32x4    m16, [grain_lutq+left_offxyq+32], 2
    movu            ym21, [grain_lutq+top_offxyq-82]
    vinserti32x8    m21, [grain_lutq+top_offxyq+0], 1
    movd            xm17, [grain_lutq+topleft_offxyq-50]
    vinserti32x4    m17, [grain_lutq+topleft_offxyq+32], 2
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       m16, m19
    pmaddubsw       m16, m10, m16
    punpcklbw       m17, m21
    pmaddubsw       m17, m10, m17
    punpckhbw       m20, m21, m19
    pmulhrsw        m16, m9
    pmulhrsw        m17, m9
    packsswb        m19{k1}, m16, m16
    packsswb        m21{k1}, m17, m17
    ; followed by v interpolation (top | cur -> cur)
    punpcklbw       m21, m19
    call            .add_noise_v
    sub             hb, 2
    jg              .loop_y_h_overlap
    add             wq, 32
    lea             srcq, [src_bakq+wq]     ; lea leaves the flags of the add intact
    jl              .hv_overlap
.end:
    RET
; Shared tail of fgy_32x32xn_8bpc, entered with `call` at one of three labels.
; Each entry point falls through into the next one:
;   .add_noise_v  m20/m21 = (top, cur) grain byte pairs (high/low halves of
;                 each 16-byte lane); blends them with the weights in m12
;   .add_noise    m21 = grain bytes for two rows of 32 pixels; splits them
;                 into the two pmaddubsw operand layouts used below
;   .add_noise_h  m20/m21 already split; applies the noise to two source rows
; Uses m16-m19 and k2 as scratch. On return grain_lutq has advanced by two
; grain rows (2*82 bytes) and srcq by two strides.
ALIGN function_align
.add_noise_v:
pmaddubsw m20, m12, m20
pmaddubsw m21, m12, m21
; m9 holds pw_1024, so pmulhrsw is a rounded shift right by 5
pmulhrsw m20, m9
pmulhrsw m21, m9
; back to signed bytes in the original pixel order
packsswb m21, m20
.add_noise:
; m20 = (0, grain[i]) pairs for the low 8 bytes of each lane,
; m21 = (grain[8+i], 0) pairs for the high 8 bytes of each lane
punpcklbw m20, m5, m21
punpckhbw m21, m5
.add_noise_h:
; two source rows of 32 pixels: row 0 in the low half, row 1 in the high half
mova ym18, [srcq+strideq*0 ]
vinserti32x8 m18, [srcq+strideq*1 ], 1
mova m19, m0
; source pixels widened to words (m16 = low 8 bytes per lane, m17 = high 8)
punpcklbw m16, m18, m5
vpermt2b m19, m18, m1 ; scaling[ 0..127]
; k2 = top bit of every source byte, i.e. pixels whose value is >= 128
vpmovb2m k2, m18
punpckhbw m17, m18, m5
vpermi2b m18, m2, m3 ; scaling[128..255]
vmovdqu8 m19{k2}, m18 ; scaling[src]
; interleave_hl: (scaling[8+i], scaling[i]) pairs, matching the grain layout
; above so that each pmaddubsw word is a single scaling*grain product
pshufb m19, m4
pmaddubsw m18, m19, m20
pmaddubsw m19, m21
add grain_lutq, 82 *2
; m6 holds the noise_rnd entry selected by fg_data.scaling_shift
pmulhrsw m18, m6 ; noise
pmulhrsw m19, m6
paddw m16, m18
paddw m17, m19
; saturate to bytes, then clamp to [m7, m8]
packuswb m16, m17
pmaxub m16, m7
pminub m16, m8
; dstq holds (dst - src), so dstq+srcq is the output address
mova [dstq+srcq], ym16
add srcq, strideq
vextracti32x8 [dstq+srcq], m16, 1
add srcq, strideq
ret
;------------------------------------------------------------------------------
; FGUV_FN name, ss_hor, ss_ver
;
; Emits fguv_32x32xn_i<name>_8bpc, the chroma counterpart of
; fgy_32x32xn_8bpc. ss_hor / ss_ver say whether chroma is subsampled
; horizontally / vertically relative to luma.
;
; With ss_hor set, one helper call covers 4 chroma rows of 16 pixels and
; horizontally adjacent luma pixels are averaged; otherwise it covers 2 rows
; of 32 pixels. With ss_ver set, every second luma row is read.
;
; The body is generated twice by the nested loop macro: once with the scaling
; index derived from luma and chroma through uv_luma_mult / uv_mult /
; uv_offset, and once (label .csfl) for chroma_scaling_from_luma, where the
; luma value is used directly. The second copy jumps into the first copy's
; .add_noise_main tail.
;
; Relies on the single-line macro `base` (r11-fg_min) defined by the luma
; function earlier in this file.
;------------------------------------------------------------------------------
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \
                                             scaling, grain_lut, h, sby, luma, \
                                             overlap, uv_pl, is_id, _, stride3
    lea             r11, [fg_min]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r9d, is_idm
    mov             sbyd, sbym
    mov             overlapd, [fg_dataq+FGData.overlap_flag]
%if %2
    mov             r12, 0x000f000f000f000f ; h_overlap mask
    vpbroadcastq    m10, [base+pb_23_22_0_32]
    lea             stride3q, [strideq*3]
%else
    mov             r12, 0x0000000f0000000f
    vpbroadcastq    m10, [base+pb_27_17_17_27]
%endif
    mova            m0, [scalingq+64*0]
    mova            m1, [scalingq+64*1]
    mova            m2, [scalingq+64*2]
    mova            m3, [scalingq+64*3]
    kmovq           k1, r12
    vbroadcasti32x4 m4, [base+interleave_hl]
    vpbroadcastd    m6, [base+noise_rnd+r6*4-32]
    vpbroadcastd    m7, [base+fg_min+r7*4]
    ; fg_max index = clip << is_id: 0 -> 255, 1 -> 240, 2 -> 235
    shlx            r7d, r7d, r9d
    vpbroadcastd    m8, [base+fg_max+r7*4]
    test            sbyd, sbyd
    setnz           r7b
    vpbroadcastd    m9, [base+pw_1024]
    mova            m11, [base+pb_even]
    mova            m12, [base+pb_odd]
    pxor            m5, m5
    mov             r5, r10mp               ; lstride
    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne             .csfl

%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
                h, sby, see, overlap, uv_pl, _, _, stride3
%if %1
    mov             r6d, uv_plm
    vpbroadcastd    m16, [base+pw_8]
    vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4]
    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
    pshufb          m14, m16                ; uv_luma_mult, uv_mult
%endif
    ; vertical overlap only when sby != 0 and the overlap flag is set
    test            r7b, overlapb
    jnz             %%v_overlap

    imul            seed, sbyd, (173 << 24) | 37
    add             seed, (105 << 24) | 178
    rorx            seed, seed, 24
    movzx           seed, seew
    xor             seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, overlap, _, _, _, stride3

    mov             lumaq, r9mp
    ; end-of-row pointers; src and dst ones are also spilled to stack slots
    ; because r11/r12 get reused under other names below
    lea             r11, [srcq+wq]
    lea             r12, [dstq+wq]
    lea             r13, [lumaq+wq*(1+%2)]
    mov             r11mp, r11
    mov             r12mp, r12
    neg             wq

%%loop_x:
    rorx            r6, seeq, 1
    or              seed, 0xeff4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d               ; updated seed
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164>>%3
    lea             offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, overlap, _, _, _, stride3

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
%%loop_y:
%if %2
    ; four grain rows of 16 bytes
    movu            xm21, [grain_lutq+offxyq+82*0]
    vinserti128     ym21, [grain_lutq+offxyq+82*1], 1
    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
%else
    ; two grain rows of 32 bytes
    movu            ym21, [grain_lutq+offxyq+82*0]
    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
%endif
    call            %%add_noise
    sub             hb, 2<<%2
    jg              %%loop_y
    add             wq, 32>>%2
    jge             .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r13+wq*(1<<%2)]
    add             srcq, wq
    add             dstq, wq
    test            overlapd, overlapd
    jz              %%loop_x
    cmp             dword r8m, 0            ; sby
    jne             %%hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    rorx            r6, seeq, 1
    or              seed, 0xeff4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, left_offxy, _, _, _, stride3

    lea             left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164>>%3
    lea             offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, left_offxy, _, _, _, stride3

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
%%loop_y_h_overlap:
%if %2
    movu            xm20, [grain_lutq+offxyq     +82*0]
    movd            xm19, [grain_lutq+left_offxyq+82*0]
    vinserti32x4    ym20, [grain_lutq+offxyq     +82*1], 1
    vinserti32x4    ym19, [grain_lutq+left_offxyq+82*1], 1
    vinserti32x4    m20, [grain_lutq+offxyq     +82*2], 2
    vinserti32x4    m19, [grain_lutq+left_offxyq+82*2], 2
    vinserti32x4    m20, [grain_lutq+offxyq     +82*3], 3
    vinserti32x4    m19, [grain_lutq+left_offxyq+82*3], 3
%else
    movu            ym20, [grain_lutq+offxyq     + 0]
    movd            xm19, [grain_lutq+left_offxyq+ 0]
    vinserti32x8    m20, [grain_lutq+offxyq     +82], 1
    vinserti32x4    m19, [grain_lutq+left_offxyq+82], 2
%endif
    punpcklbw       m19, m20                ; (left, cur) byte pairs
    pmaddubsw       m19, m10, m19
    punpckhbw       m21, m20, m5
    pmulhrsw        m19, m9
    vpacksswb       m20{k1}, m19, m19       ; only the k1-selected bytes change
    punpcklbw       m20, m5, m20
    call            %%add_noise_h
    sub             hb, 2<<%2
    jg              %%loop_y_h_overlap
    add             wq, 32>>%2
    jge             .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r13+wq*(1<<%2)]
    add             srcq, wq
    add             dstq, wq
    cmp             dword r8m, 0            ; sby
    jne             %%hv_overlap
    jmp             %%loop_x_h_overlap

%%v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
                _, sby, see, overlap, _, _, _, stride3

    ; seeds of the current and the previous block row, packed in 16-bit halves
    movzx           sbyd, sbyb
    imul            seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul            sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add             sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and             sbyd, 0xff00ff00
    xor             seed, r7d
    xor             seed, sbyd              ; (cur_seed << 16) | top_seed

    ; vertical blend weights (m13) and, for the subsampled layouts, the byte
    ; mask (k3) that limits the blend to the leading row(s)
%if %3
    vpbroadcastd    m13, [base+pb_23_22]
    kxnorw          k3, k3, k3              ; v_overlap mask
%elif %2
    vbroadcasti32x8 m13, [base+pb_27_17]
    kxnord          k3, k3, k3
    pshufd          m13, m13, q0000         ; 8x27_17, 8x17_27
%else
    vpbroadcastd    ym16, [base+pb_27_17]
    vpbroadcastd    m13, [base+pb_17_27]
    vmovdqa64       m13{k1}, m16
%endif

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, overlap, top_offxy, _, _, stride3

    mov             lumaq, r9mp
    lea             r11, [srcq+wq]
    lea             r12, [dstq+wq]
    lea             r13, [lumaq+wq*(1<<%2)]
    mov             r11mp, r11
    mov             r12mp, r12
    neg             wq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1            ; updated (cur_seed << 16) | top_seed
    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0x000f000f
    and             offxd, 0x000f000f
    imul            offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, overlap, top_offxy, _, _, stride3

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
%if %3
    movu            xm18, [grain_lutq+offxyq+82*0]
    movu            xm20, [grain_lutq+top_offxyq+82*0]
    ; only interpolate first line, insert remaining line unmodified
    vbroadcasti128  ym21, [grain_lutq+offxyq+82*1]
    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    punpcklbw       xm19, xm20, xm18
    punpckhbw       xm20, xm18
%elif %2
    movu            xm18, [grain_lutq+offxyq+82*0]
    vinserti128     ym18, [grain_lutq+offxyq+82*1], 1
    movu            xm20, [grain_lutq+top_offxyq+82*0]
    vinserti32x4    ym20, [grain_lutq+top_offxyq+82*1], 1
    vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2]
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    punpcklbw       ym19, ym20, ym18
    punpckhbw       ym20, ym18
%else
    movu            ym21, [grain_lutq+offxyq+82*0]
    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
    movu            ym20, [grain_lutq+top_offxyq+82*0]
    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
%endif
    call            %%add_noise_v
    ; the remaining rows of this column need no vertical blend
    sub             hb, 2<<%2
    jg              %%loop_y
    add             wq, 32>>%2
    jge             .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r13+wq*(1<<%2)]
    add             srcq, wq
    add             dstq, wq

%%hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1            ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3

    lea             topleft_offxyd, [top_offxyq+(32>>%2)]
    lea             left_offxyd, [offyq+(32>>%2)]
    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0x000f000f
    and             offxd, 0x000f000f
    imul            offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
%if %2
    movu            xm21, [grain_lutq+offxyq+82*0]
    movd            xm16, [grain_lutq+left_offxyq+82*0]
    vinserti128     ym21, [grain_lutq+offxyq+82*1], 1
    vinserti128     ym16, [grain_lutq+left_offxyq+82*1], 1
    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    vinserti32x4    m16, [grain_lutq+left_offxyq+82*2], 2
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    vinserti32x4    m16, [grain_lutq+left_offxyq+82*3], 3
    movd            xm18, [grain_lutq+topleft_offxyq+82*0]
    movu            xm20, [grain_lutq+top_offxyq]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       m16, m21
%if %3
    punpcklbw       xm18, xm20
%else
    vinserti128     ym18, [grain_lutq+topleft_offxyq+82*1], 1
    vinserti128     ym20, [grain_lutq+top_offxyq+82*1], 1
    punpcklbw       ym18, ym20
%endif
    punpcklqdq      m16, m18
    pmaddubsw       m16, m10, m16
    pmulhrsw        m16, m9
    packsswb        m16, m16
    vmovdqu8        m21{k1}, m16
%if %3
    vpalignr        xm20{k1}, xm16, xm16, 4
    punpcklbw       xm19, xm20, xm21
    punpckhbw       xm20, xm21
%else
    vpalignr        ym20{k1}, ym16, ym16, 4
    punpcklbw       ym19, ym20, ym21
    punpckhbw       ym20, ym21
%endif
%else
    movu            ym21, [grain_lutq+offxyq+82*0]
    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
    movd            xm16, [grain_lutq+left_offxyq+82*0]
    vinserti32x4    m16, [grain_lutq+left_offxyq+82*1], 2
    movu            ym20, [grain_lutq+top_offxyq+82*0]
    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
    movd            xm18, [grain_lutq+topleft_offxyq+82*0]
    vinserti32x4    m18, [grain_lutq+topleft_offxyq+82*1], 2
    punpcklbw       m16, m21
    punpcklbw       m18, m20
    punpcklqdq      m16, m18
    pmaddubsw       m16, m10, m16
    pmulhrsw        m16, m9
    packsswb        m16, m16
    vpalignr        m20{k1}, m16, m16, 4
    vmovdqu8        m21{k1}, m16
%endif
    call            %%add_noise_v
    sub             hb, 2<<%2
    jg              %%loop_y_h_overlap
    add             wq, 32>>%2
    jge             .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r13+wq*(1<<%2)]
    add             srcq, wq
    add             dstq, wq
    jmp             %%hv_overlap

    ; Shared tail, entered with call at one of three labels; each entry
    ; point falls through into the next one.
ALIGN function_align
%%add_noise_v:
    ; blend (top, cur) grain pairs with the weights in m13, result in m21
%if %3
    pmaddubsw       xm19, xm13, xm19
    pmaddubsw       xm20, xm13, xm20
    pmulhrsw        xm19, xm9
    pmulhrsw        xm20, xm9
    vpacksswb       m21{k3}, m19, m20
%elif %2
    pmaddubsw       ym19, ym13, ym19
    pmaddubsw       ym20, ym13, ym20
    pmulhrsw        ym19, ym9
    pmulhrsw        ym20, ym9
    vpacksswb       m21{k3}, m19, m20
%else
    punpcklbw       m19, m20, m21
    punpckhbw       m20, m21
    pmaddubsw       m19, m13, m19
    pmaddubsw       m20, m13, m20
    pmulhrsw        m19, m9
    pmulhrsw        m20, m9
    packsswb        m21, m19, m20
%endif
%%add_noise:
    ; m20 = (0, grain) pairs from the low 8 bytes of each lane,
    ; m21 = (grain, 0) pairs from the high 8 bytes of each lane
    punpcklbw       m20, m5, m21
    punpckhbw       m21, m5
%%add_noise_h:
    mova            ym18, [lumaq+lstrideq*(0<<%3)]
    vinserti32x8    m18, [lumaq+lstrideq*(1<<%3)], 1
%if %2
    ; second pair of luma rows, then average horizontally adjacent luma
    ; pixels (even bytes with odd bytes) down to the chroma width
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
    mova            ym16, [lumaq+lstrideq*(0<<%3)]
    vinserti32x8    m16, [lumaq+lstrideq*(1<<%3)], 1
    mova            xm17, [srcq+strideq*0]
    mova            m19, m11
    vpermi2b        m19, m18, m16
    vinserti128     ym17, [srcq+strideq*1], 1
    vpermt2b        m18, m12, m16
    vinserti32x4    m17, [srcq+strideq*2], 2
    pavgb           m18, m19
    vinserti32x4    m17, [srcq+stride3q ], 3
%else
    mova            ym17, [srcq+strideq*0]
    vinserti32x8    m17, [srcq+strideq*1], 1
%endif
%if %1
    ; scaling index = sat_u8(((luma, chroma) . m14 >> 6) + m15)
    punpckhbw       m19, m18, m17
    punpcklbw       m18, m17                ; { luma, chroma }
    pmaddubsw       m19, m14
    pmaddubsw       m18, m14
    psraw           m19, 6
    psraw           m18, 6
    paddw           m19, m15
    paddw           m18, m15
    packuswb        m18, m19
.add_noise_main:
    ; m18 = scaling index bytes, m17 = chroma source bytes, m20/m21 = grain
    mova            m19, m0
    vpermt2b        m19, m18, m1            ; scaling[  0..127]
    vpmovb2m        k2, m18
    vpermi2b        m18, m2, m3             ; scaling[128..255]
    vmovdqu8        m19{k2}, m18            ; scaling[src]
    pshufb          m19, m4
    pmaddubsw       m18, m19, m20
    pmaddubsw       m19, m21
    add             grain_lutq, 82*2<<%2
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
    lea             srcq, [srcq+strideq*(2<<%2)]
    pmulhrsw        m18, m6                 ; noise
    pmulhrsw        m19, m6
    punpcklbw       m16, m17, m5            ; chroma
    punpckhbw       m17, m5
    paddw           m16, m18
    paddw           m17, m19
    packuswb        m16, m17
    pmaxub          m16, m7
    pminub          m16, m8
%if %2
    mova            [dstq+strideq*0], xm16
    vextracti128    [dstq+strideq*1], ym16, 1
    vextracti32x4   [dstq+strideq*2], m16, 2
    vextracti32x4   [dstq+stride3q ], m16, 3
%else
    mova            [dstq+strideq*0], ym16
    vextracti32x8   [dstq+strideq*1], m16, 1
%endif
    lea             dstq, [dstq+strideq*(2<<%2)]
    ret
%else
    ; chroma_scaling_from_luma: m18 (luma) is the scaling index as-is; reuse
    ; the tail emitted by the first expansion
    jmp             .add_noise_main
%endif
%endmacro

    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro
; Instantiate the chroma function once per layout.
; Arguments: name suffix, ss_hor, ss_ver (see the FGUV_FN macro header).
FGUV_FN 420 , 1 , 1
FGUV_FN 422 , 1 , 0
FGUV_FN 444 , 0 , 0
%endif ; ARCH_X86_64
; Measurement V0.5 in percent: C=93 H=89 G=90
; Processing time: 0.12 seconds
; (preprocessed on 2026-06-04)
; © Formatika GbR, Germany