/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/loongarch/loongson_asm.S"
/*
 * static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
 *                        const int bx4, const int bw4, int bh4)
 *
 * For each of the bh4 row pointers in rr[], writes bw4 copies of the 12-byte
 * block at rmv, starting bx4 * 12 bytes into that row.
 *
 * In:       a0 = rr, a1 = rmv, a2 = bx4, a3 = bw4, a4 = bh4
 * Clobbers: a0, a2, a4, t3-t8, vr0-vr3
 * Notes:
 *  - The width is dispatched through a six-entry jump table indexed by
 *    clz(bw4) - 26, so bw4 is expected to be one of 1, 2, 4, 8, 16, 32.
 *  - Every row loop is bottom-tested: at least one row is always written.
 *  - vr1/vr2/vr3 hold the 12-byte pattern at its three possible phases, so
 *    storing vr1, vr2, vr3 back to back emits 48 bytes = 4 whole blocks.
 *  - The initial vld reads 16 bytes from rmv; only the first 12 are used.
 */
function splat_mv_lsx
    vld           vr0, a1, 0              // 0 1 ... 11 ...
    clz.w         t4, a3
    vaddi.bu      vr1, vr0, 0             // vr1 = vr0
    addi.w        t4, t4, -26             // table index: 0 for bw4 = 32 ... 5 for bw4 = 1
    vextrins.w    vr1, vr0, 0x30          // 0 1 2 ... 11 0 1 2 3
    la.local      t5, .SPLAT_LSX_JRTABLE
    vbsrl.v       vr2, vr1, 4             // 4 5 6 7 ... 11 0 1 2 3 0 0 0 0
    alsl.d        t6, t4, t5, 1           // &table[index] (2-byte entries)
    vextrins.w    vr2, vr0, 0x31          // 4 5 6 7 ... 11 0 1 2 3 4 5 6 7
    ld.h          t7, t6, 0               // offset of the handler from the table
    vbsrl.v       vr3, vr2, 4             // 8 9 10 11 0 1 2 3 4 5 6 7 0 0 0 0
    add.d         t8, t5, t7              // handler address
    alsl.d        a2, a2, a2, 1           // bx4 * 3
    vextrins.w    vr3, vr0, 0x32          // 8 9 10 11 0 1 2 3 4 5 6 7 8 9 10 11
    slli.w        a2, a2, 2               // bx4 * 12 = byte offset into each row
    jirl          $r0, t8, 0              // jump to the handler (no link)

.SPLAT_LSX_JRTABLE:
    .hword        .SPLAT_W32_LSX - .SPLAT_LSX_JRTABLE
    .hword        .SPLAT_W16_LSX - .SPLAT_LSX_JRTABLE
    .hword        .SPLAT_W8_LSX  - .SPLAT_LSX_JRTABLE
    .hword        .SPLAT_W4_LSX  - .SPLAT_LSX_JRTABLE
    .hword        .SPLAT_W2_LSX  - .SPLAT_LSX_JRTABLE
    .hword        .SPLAT_W1_LSX  - .SPLAT_LSX_JRTABLE

.SPLAT_W1_LSX:                            // 1 block = 12 bytes per row
    ld.d          t3, a0, 0               // t3 = *rr
    addi.d        a0, a0, 8               // rr++
    addi.d        a4, a4, -1              // bh4--
    add.d         t3, t3, a2              // t3 = row + bx4 * 12

    fst.d         f1, t3, 0               // bytes 0..7  (low half of vr1)
    fst.s         f3, t3, 8               // bytes 8..11 (low word of vr3)
    blt           zero, a4, .SPLAT_W1_LSX
    b             .splat_end

.SPLAT_W2_LSX:                            // 2 blocks = 24 bytes per row
    ld.d          t3, a0, 0
    addi.d        a0, a0, 8
    addi.d        a4, a4, -1
    add.d         t3, t3, a2

    vst           vr1, t3, 0
    fst.d         f2, t3, 16              // low half of vr2
    blt           zero, a4, .SPLAT_W2_LSX
    b             .splat_end

.SPLAT_W4_LSX:                            // 4 blocks = 48 bytes per row
    ld.d          t3, a0, 0
    addi.d        a0, a0, 8
    addi.d        a4, a4, -1
    add.d         t3, t3, a2

    vst           vr1, t3, 0
    vst           vr2, t3, 16
    vst           vr3, t3, 32
    blt           zero, a4, .SPLAT_W4_LSX
    b             .splat_end

.SPLAT_W8_LSX:                            // 8 blocks = 96 bytes per row
    ld.d          t3, a0, 0
    addi.d        a0, a0, 8
    addi.d        a4, a4, -1
    add.d         t3, t3, a2

    vst           vr1, t3, 0
    vst           vr2, t3, 16
    vst           vr3, t3, 32
    vst           vr1, t3, 48
    vst           vr2, t3, 64
    vst           vr3, t3, 80
    blt           zero, a4, .SPLAT_W8_LSX
    b             .splat_end

.SPLAT_W16_LSX:                           // 16 blocks = 2 * 96 bytes per row
    ld.d          t3, a0, 0
    addi.d        a0, a0, 8
    addi.d        a4, a4, -1
    add.d         t3, t3, a2

.rept 2
    vst           vr1, t3, 0
    vst           vr2, t3, 16
    vst           vr3, t3, 32
    vst           vr1, t3, 48
    vst           vr2, t3, 64
    vst           vr3, t3, 80
    addi.d        t3, t3, 96
.endr
    blt           zero, a4, .SPLAT_W16_LSX
    b             .splat_end

.SPLAT_W32_LSX:                           // 32 blocks = 4 * 96 bytes per row
    ld.d          t3, a0, 0
    addi.d        a0, a0, 8
    addi.d        a4, a4, -1
    add.d         t3, t3, a2

.rept 4
    vst           vr1, t3, 0
    vst           vr2, t3, 16
    vst           vr3, t3, 32
    vst           vr1, t3, 48
    vst           vr2, t3, 64
    vst           vr3, t3, 80
    addi.d        t3, t3, 96
.endr
    blt           zero, a4, .SPLAT_W32_LSX

.splat_end:
endfunc
// Reciprocal table of 32 halfwords: entry i holds floor(16384 / i) for
// i = 1..31; entry 0 is 0. load_tmvs_lsx loads all 64 bytes with four vld.
const la_div_mult
    .hword      0, 16384,  8192,  5461,  4096,  3276,  2730,  2340
    .hword   2048,  1820,  1638,  1489,  1365,  1260,  1170,  1092
    .hword   1024,   963,   910,   862,   819,   780,   744,   712
    .hword    682,   655,   630,   606,   585,   564,   546,   528
endconst
/*
 * LOAD_SET_LOOP is_odd
 *
 * Fills a rectangle of 5-byte entries by storing the 32-bit value in t7 into
 * the first 4 bytes of each entry (byte 4 of each entry is left untouched).
 *
 * is_odd:   1 - emit one leading single store per row, then continue in
 *               pairs (used when the entry count per row is odd);
 *           0 - store in pairs only.
 * In:       t3 = address of the first entry of the first row
 *           t2 = row pitch in bytes
 *           t5 = first row index, a5 = end row index (exclusive)
 *           t6 = number of entries per row
 *           t7 = value to store
 * Clobbers: a6, a7 (temporaries), t3 (advanced by t2 per row), t5
 * Note:     both loops are bottom-tested, so one row is always processed and
 *           the paired loop always performs at least two stores when entered.
 */
.macro LOAD_SET_LOOP is_odd
    slli.d        a6, t6, 2
    add.d         a6, a6, t6              // col_w * 5 = row length in bytes
0:
    addi.d        a7, zero, 0             // x (byte offset inside the row)
.if \is_odd
    stx.w         t7, t3, a7
    addi.d        a7, a7, 5
    bge           a7, a6, 2f              // row had a single entry
.endif
1:
    stx.w         t7, t3, a7
    addi.d        a7, a7, 5
    stx.w         t7, t3, a7
    addi.d        a7, a7, 5
    blt           a7, a6, 1b
2:
    add.d         t3, t3, t2              // next row
    addi.d        t5, t5, 1
    blt           t5, a5, 0b
.endm
/*
 * static void load_tmvs_c(const refmvs_frame *const rf, int tile_row_idx,
 *                         const int col_start8, const int col_end8,
 *                         const int row_start8, int row_end8)
 *
 * In: a0 = rf, a1 = tile_row_idx, a2 = col_start8, a3 = col_end8,
 *     a4 = row_start8, a5 = row_end8
 *
 * Phase 1: stores 0x80008000 into the mv field of every 5-byte rp_proj entry
 *          in [row_start8, row_end8) x [col_start8, col_end8) via
 *          LOAD_SET_LOOP.
 * Phase 2: for each of the rf->n_mfmvs references, walks rp_ref[ref], scales
 *          each stored mv and writes it to the projected position in rp_proj
 *          when that position lies inside the allowed window.
 *
 * Register use in phase 2:
 *   t0 = col_start8i = max(col_start8 - 8, 0)
 *   t1 = col_end8i   = min(col_end8 + 8, word 0 of the vector at rf + 16)
 *   a5 = row_end8 clipped to word 1 of the vector at rf + 16
 *        (NOTE(review): presumably iw8 / ih8 - confirm against refmvs_frame)
 *   t2 = stride * 5 (row pitch in bytes), t3 = rp_proj + tile_row_idx*16*pitch
 *   t6 = row_start8 * pitch, t5 = cursor into mfmv_ref2ref (+28 per n)
 *   s6 = rp_ref, s8 = n_mfmvs, a6 = n, s7 = 40 (bits in one 5-byte entry)
 *   s1 = current row in rp_ref[ref], s2 = y, s3/s4 = y_proj_start/end
 *   s5 = x, a7 = rb, t7 = pos_x, s0 = rp_proj + pos
 *
 * s0-s8 are callee-saved and are spilled to an 80-byte frame.
 */
function load_tmvs_lsx
    addi.d        sp, sp, -80
    st.d          s0, sp, 0
    st.d          s1, sp, 8
    st.d          s2, sp, 16
    st.d          s3, sp, 24
    st.d          s4, sp, 32
    st.d          s5, sp, 40
    st.d          s6, sp, 48
    st.d          s7, sp, 56
    st.d          s8, sp, 64

    vld           vr16, a0, 16            // upper bounds used by the vmin.w below
    vld           vr0, a0, 52             // rf->mfmv_ref
    ld.w          s8, a0, 152             // [0] - rf->n_mfmvs
    vld           vr17, a0, 168           // [0] - rp_ref | [1] - rp_proj
    ld.d          t1, a0, 184             // stride
    ld.w          t0, a0, 200             // NOTE(review): presumably n_tile_threads - confirm
    addi.w        t0, t0, -1
    bnez          t0, 1f
    addi.w        a1, zero, 0             // value at rf + 200 is 1: tile_row_idx = 0
1:
    addi.d        t0, a3, 8
    vinsgr2vr.w   vr1, t0, 0
    vinsgr2vr.w   vr1, a5, 1
    vmin.w        vr1, vr1, vr16          // [0] col_end8i [1] row_end8
    addi.d        t0, a2, -8
    bge           t0, zero, 2f
    addi.w        t0, zero, 0             // t0 col_start8i
2:
    vpickve2gr.d  t4, vr17, 1             // rf->rp_proj
    slli.d        t2, t1, 2
    add.d         t2, t2, t1              // stride * 5
    slli.d        a1, a1, 4               // tile_row_idx * 16
    andi          t3, a4, 0xf
    add.d         t3, t3, a1              // tile_row_idx * 16 + (row_start8 & 15)
    mul.w         t3, t3, t2
    mul.w         t8, a1, t2              // tile_row_idx * 16 * pitch, used in phase 2
    vpickve2gr.w  a5, vr1, 1              // clipped row_end8
    addi.d        t5, a4, 0               // y = row_start8
    sub.d         t6, a3, a2              // col_end8 - col_start8
    li.w          t7, 0x80008000          // mv value written by LOAD_SET_LOOP
    slli.d        a7, a2, 2
    add.d         t3, t3, a2
    add.d         t3, t3, a7              // + col_start8 * 5
    add.d         t3, t3, t4              // rp_proj
    andi          a6, t6, 1
    bnez          a6, 3f
    LOAD_SET_LOOP 0
    b             4f
3:
    LOAD_SET_LOOP 1
4:
    addi.d        a6, zero, 0             // n
    bge           a6, s8, .end_load
    add.d         t3, t8, t4              // rp_proj
    mul.w         t6, a4, t2              // row_start8 * pitch
    addi.d        s7, zero, 40            // 40 bits = one 5-byte entry, see cto.d below
    vpickve2gr.w  t1, vr1, 0              // col_end8i
    vbsrl.v       vr2, vr0, 4             // rf->mfmv_ref2cur
    addi.d        t5, a0, 64              // rf->mfmv_ref2ref
    la.local      t8, la_div_mult
    vld           vr6, t8, 0
    vld           vr7, t8, 16
    vld           vr8, t8, 32
    vld           vr9, t8, 48
    li.w          t8, 0x3fff
    vreplgr2vr.h  vr21, t8                // clip upper bound  0x3fff
    vxor.v        vr18, vr18, vr18        // zero
    vsub.h        vr20, vr18, vr21        // clip lower bound -0x3fff
    vpickev.b     vr12, vr7, vr6          // low bytes of la_div_mult[0..15]
    vpickod.b     vr13, vr7, vr6          // high bytes of la_div_mult[0..15]
    vpickev.b     vr14, vr9, vr8          // low bytes of la_div_mult[16..31]
    vpickod.b     vr15, vr9, vr8          // high bytes of la_div_mult[16..31]
    vpickve2gr.d  s6, vr17, 0             // rf->rp_ref
5:
    vld           vr10, t5, 0
    vld           vr11, t5, 16
    vpickev.h     vr10, vr11, vr10
    vpickev.b     vr10, vr11, vr10        // [1...7]
    vbsrl.v       vr0, vr0, 1
    vpickve2gr.wu t8, vr2, 0              // ref2cur
    vbsrl.v       vr2, vr2, 4
    srli.d        t4, t8, 24
    xori          t4, t4, 0x80
    beqz          t4, 8f                  // skip this n when the top byte of ref2cur is 0x80
    vreplgr2vr.h  vr23, t8
    vshuf.b       vr6, vr14, vr12, vr10   // low bytes of la_div_mult[ref2ref[i]]
    vshuf.b       vr7, vr15, vr13, vr10   // high bytes of la_div_mult[ref2ref[i]]
    vilvl.b       vr8, vr7, vr6           // la_div_mult[ref2ref[i]] as halfwords
    vmulwev.w.h   vr6, vr8, vr23          // ref2cur * la_div_mult[...], even lanes
    vmulwod.w.h   vr7, vr8, vr23          // ref2cur * la_div_mult[...], odd lanes
    vpickve2gr.b  s0, vr0, 0              // ref
    slli.d        t8, s0, 3
    ldx.d         s1, s6, t8              // rf->rp_ref[ref]
    addi.d        s0, s0, -4              // ref_sign
    vreplgr2vr.h  vr19, s0
    add.d         s1, s1, t6              // &rf->rp_ref[ref][row_start8 * stride]
    addi.d        s2, a4, 0               // y
    vilvl.w       vr8, vr7, vr6           // frac[0..3]
    vilvh.w       vr9, vr7, vr6           // frac[4..7]
6:                                        // for (int y = row_start8;
    // y_sb_align = y & ~7. Built with a shift pair instead of andi 0xff8,
    // whose 12-bit immediate would also clear bits 12 and up (y >= 4096).
    srli.d        s3, s2, 3
    slli.d        s3, s3, 3
    addi.d        s4, s3, 8
    blt           a4, s3, 0f
    addi.d        s3, a4, 0               // y_proj_start
0:
    blt           s4, a5, 0f
    addi.d        s4, a5, 0               // y_proj_end
0:
    addi.d        s5, t0, 0               // x
7:                                        // for (int x = col_start8i;
    slli.d        a7, s5, 2
    add.d         a7, a7, s5
    add.d         a7, s1, a7              // rb
    vld           vr3, a7, 0              // [rb]
    vpickve2gr.b  t4, vr3, 4              // b_ref
    beqz          t4, .end_x
    vreplve.b     vr11, vr10, t4
    vpickve2gr.b  t7, vr11, 4             // ref2ref
    beqz          t7, .end_x
    vsllwil.w.h   vr4, vr3, 0             // mv.y, mv.x widened to 32 bit
    vreplgr2vr.w  vr6, t4
    vshuf.w       vr6, vr9, vr8           // frac
    vmul.w        vr5, vr6, vr4
    vsrai.w       vr4, vr5, 31
    vadd.w        vr4, vr4, vr5           // product + (product >> 31)
    vssrarni.h.w  vr4, vr4, 14            // rounding shift right by 14, saturating narrow
    vclip.h       vr4, vr4, vr20, vr21    // offset
    vxor.v        vr5, vr4, vr19          // offset.x ^ ref_sign
    vori.b        vr5, vr5, 0x1           // keep the sign source non-zero for vsigncov
    vabsd.h       vr4, vr4, vr18
    vsrli.h       vr4, vr4, 6             // abs(offset.x) >> 6
    vsigncov.h    vr4, vr5, vr4           // apply_sign
    vpickve2gr.h  s0, vr4, 0
    add.d         s0, s2, s0              // pos_y
    blt           s0, s3, .n_posy
    bge           s0, s4, .n_posy
    andi          s0, s0, 0xf
    mul.w         s0, s0, t2              // pos
    vpickve2gr.h  t7, vr4, 1
    add.d         t7, t7, s5              // pos_x
    add.d         s0, t3, s0              // rp_proj + pos
.loop_posx:
    srli.d        t4, s5, 3
    slli.d        t4, t4, 3               // x_sb_align = x & ~7 (exact for any x >= 0)
    blt           t7, a2, .n_posx
    addi.d        t8, t4, -8
    blt           t7, t8, .n_posx
    bge           t7, a3, .n_posx
    addi.d        t4, t4, 16
    bge           t7, t4, .n_posx
    slli.d        t4, t7, 2
    add.d         t4, t4, t7              // pos_x * 5
    add.d         t4, s0, t4              // rp_proj[pos + pos_x]
    vstelm.w      vr3, t4, 0, 0           // .mv
    vstelm.b      vr11, t4, 4, 4          // .ref = ref2ref
.n_posx:
    addi.d        s5, s5, 1               // x + 1
    bge           s5, t1, .ret_posx
    addi.d        a7, a7, 5               // rb + 1
    vld           vr4, a7, 0              // [rb]
    vseq.b        vr5, vr4, vr3
    vpickve2gr.d  t8, vr5, 0
    cto.d         t8, t8                  // number of matching low bits
    blt           t8, s7, 7b              // entry differs from the previous one: recompute
    addi.d        t7, t7, 1               // pos_x + 1
    /* Core computing loop expansion (second) */
    srli.d        t4, s5, 3
    slli.d        t4, t4, 3               // x_sb_align
    blt           t7, a2, .n_posx
    addi.d        t8, t4, -8
    blt           t7, t8, .n_posx
    bge           t7, a3, .n_posx
    addi.d        t4, t4, 16
    bge           t7, t4, .n_posx
    slli.d        t4, t7, 2
    add.d         t4, t4, t7              // pos_x * 5
    add.d         t4, s0, t4              // rp_proj[pos + pos_x]
    vstelm.w      vr3, t4, 0, 0
    vstelm.b      vr11, t4, 4, 4
    addi.d        s5, s5, 1               // x + 1
    bge           s5, t1, .ret_posx
    addi.d        a7, a7, 5               // rb + 1
    vld           vr4, a7, 0              // [rb]
    vseq.b        vr5, vr4, vr3
    vpickve2gr.d  t8, vr5, 0
    cto.d         t8, t8
    blt           t8, s7, 7b
    addi.d        t7, t7, 1               // pos_x + 1
    /* Core computing loop expansion (third) */
    srli.d        t4, s5, 3
    slli.d        t4, t4, 3               // x_sb_align
    blt           t7, a2, .n_posx
    addi.d        t8, t4, -8
    blt           t7, t8, .n_posx
    bge           t7, a3, .n_posx
    addi.d        t4, t4, 16
    bge           t7, t4, .n_posx
    slli.d        t4, t7, 2
    add.d         t4, t4, t7              // pos_x * 5
    add.d         t4, s0, t4              // rp_proj[pos + pos_x]
    vstelm.w      vr3, t4, 0, 0
    vstelm.b      vr11, t4, 4, 4
    addi.d        s5, s5, 1               // x + 1
    bge           s5, t1, .ret_posx
    addi.d        a7, a7, 5               // rb + 1
    vld           vr4, a7, 0              // [rb]
    vseq.b        vr5, vr4, vr3
    vpickve2gr.d  t8, vr5, 0
    cto.d         t8, t8
    blt           t8, s7, 7b
    addi.d        t7, t7, 1               // pos_x + 1
    b             .loop_posx
.n_posy:                                  // pos_y out of range: skip the run of identical entries
    addi.d        s5, s5, 1               // x + 1
    bge           s5, t1, .ret_posx
    addi.d        a7, a7, 5               // rb + 1
    vld           vr4, a7, 0              // [rb]
    vseq.b        vr5, vr4, vr3
    vpickve2gr.d  t8, vr5, 0
    cto.d         t8, t8
    blt           t8, s7, 7b
    addi.d        s5, s5, 1               // x + 1
    bge           s5, t1, .ret_posx
    addi.d        a7, a7, 5               // rb + 1
    vld           vr4, a7, 0              // [rb]
    vseq.b        vr5, vr4, vr3
    vpickve2gr.d  t8, vr5, 0
    cto.d         t8, t8
    blt           t8, s7, 7b
    b             .n_posy
.end_x:
    addi.d        s5, s5, 1               // x + 1
    blt           s5, t1, 7b
.ret_posx:
    add.d         s1, s1, t2              // r + stride
    addi.d        s2, s2, 1               // y + 1
    blt           s2, a5, 6b
8:
    addi.d        a6, a6, 1               // n + 1
    addi.d        t5, t5, 28              // mfmv_ref2ref(offset) + 28
    blt           a6, s8, 5b
.end_load:
    ld.d          s0, sp, 0
    ld.d          s1, sp, 8
    ld.d          s2, sp, 16
    ld.d          s3, sp, 24
    ld.d          s4, sp, 32
    ld.d          s5, sp, 40
    ld.d          s6, sp, 48
    ld.d          s7, sp, 56
    ld.d          s8, sp, 64
    addi.d        sp, sp, 80
endfunc
// Byte-shuffle patterns for save_tmvs_lsx, one 16-byte row per case index
// (row address = mv_tbls + case * 16). Each non-empty row repeats a 5-byte
// pattern (4 source bytes followed by one more) across the vector.
// Row 0 is all 255: save_tmvs_lsx masks lanes whose index byte is negative
// to zero after the shuffle, so this row yields an all-zero result.
const mv_tbls
    .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
    .byte   0,   1,   2,   3,   8,   0,   1,   2,   3,   8,   0,   1,   2,   3,   8,   0
    .byte   4,   5,   6,   7,   9,   4,   5,   6,   7,   9,   4,   5,   6,   7,   9,   4
    .byte   4,   5,   6,   7,   9,   4,   5,   6,   7,   9,   4,   5,   6,   7,   9,   4
endconst
// Multipliers for the vmulwev.h.bu in save_tmvs_lsx: the even bytes are
// 1, 2, 1, 2, 0, 0, 0, 0, i.e. the {1, 2} weights for two candidate blocks.
const mask_mult
    .byte   1,   0,   2,   0,   1,   0,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0
endconst
// vshuf.b index vector 1..16, loaded into vr8 by save_tmvs_lsx to derive the
// second output vector from the first.
const mask_mv0
    .byte   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16
endconst
// vshuf.b index vector 4..19, loaded into vr7 by save_tmvs_lsx to derive the
// third output vector from the first two.
const mask_mv1
    .byte   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19
endconst
// void dav1d_save_tmvs_lsx(refmvs_temporal_block *rp, ptrdiff_t stride,
//                          refmvs_block **rr, const uint8_t *ref_sign,
//                          int col_end8, int row_end8,
//                          int col_start8, int row_start8)
//
// In: a0 = rp, a1 = stride, a2 = rr, a3 = ref_sign, a4 = col_end8,
//     a5 = row_end8, a6 = col_start8, a7 = row_start8
//
// For every row y in [row_start8, row_end8) the candidate blocks
// b[x * 2 + 1] of rr[(y & 15) * 2] are walked two at a time. For each
// candidate a case index (0..3 expected) is derived from ref_sign[ref] and
// from |mv| being small enough; it selects a shuffle row in mv_tbls that
// turns the candidate into the 5-byte output pattern (or zeros). The pattern
// is then stored bt2 times by one of the helpers 10/20/40/80/160 below,
// chosen through .save_tevs_tbl by cand_b->bs.
//
// Register use:
//   t2 = .save_tevs_tbl, t7 = mv_tbls, t8 = 24 (two refmvs_block)
//   t3 = cand_b, t4 = end_cand_b, t1 = cand_b after the first candidate
//   a3 = output cursor (&rp[x]); each helper advances it
//   t5 / s0 = helper address for the first / second candidate
//   t0 = saved ra (ra is reused to call the helpers), a5 = rows left
//   vr0-vr2 / vr4-vr6 = output pattern vectors for first / second candidate
// The row loop is bottom-tested, so at least one row is processed.
function save_tmvs_lsx
    addi.d        sp, sp, -0x30           // 5 saved regs; rounded up so sp stays 16-byte aligned
    st.d          s0, sp, 0x00
    st.d          s1, sp, 0x08
    st.d          s2, sp, 0x10
    st.d          s3, sp, 0x18
    st.d          s4, sp, 0x20
    move          t0, ra                  // ra is clobbered by the helper calls

    vxor.v        vr10, vr10, vr10        // zero
    vld           vr11, a3, 0             // Load ref_sign[0] ~ ref_sign[7]
    la.local      t2, .save_tevs_tbl
    la.local      s1, mask_mult
    la.local      t7, mv_tbls
    vld           vr9, s1, 0              // Load mask_mult
    vslli.d       vr11, vr11, 8           // 0, ref_sign[0], ..., ref_sign[6]
    la.local      s3, mask_mv0
    vld           vr8, s3, 0              // Load mask_mv0
    la.local      s4, mask_mv1
    vld           vr7, s4, 0              // Load mask_mv1
    li.d          s0, 5
    li.d          t8, 12*2
    mul.d         a1, a1, s0              // stride *= 5
    sub.d         a5, a5, a7              // h = row_end8 - row_start8
    slli.d        a7, a7, 1               // row_start8 <<= 1
1:
    li.d          s0, 5
    andi          t3, a7, 30              // (y & 15) * 2
    slli.d        s4, t3, 3
    ldx.d         t3, a2, s4              // b = rr[(y & 15) * 2]
    addi.d        t3, t3, 12              // &b[... + 1]
    mul.d         s4, a4, t8
    add.d         t4, s4, t3              // end_cand_b = &b[col_end8*2 + 1]
    mul.d         s3, a6, t8
    add.d         t3, s3, t3              // cand_b = &b[x*2 + 1]
    mul.d         s4, a6, s0
    add.d         a3, s4, a0              // &rp[x]
2:
    /* First cand_b */
    ld.b          t5, t3, 10              // cand_b->bs
    vld           vr0, t3, 0              // cand_b->mv and ref
    alsl.d        t5, t5, t2, 2           // bt2 index
    ld.h          s3, t3, 8               // cand_b->ref
    ld.h          t6, t5, 0               // bt2
    move          s0, t2                  // valid table pointer in case there is no second cand_b
    alsl.d        t3, t6, t3, 1           // Next cand_b += bt2 * 2
    vor.v         vr2, vr0, vr0
    vinsgr2vr.h   vr1, s3, 0
    move          t1, t3
    bge           t3, t4, 3f
    /* Next cand_b */
    ld.b          s0, t3, 10              // cand_b->bs
    vld           vr4, t3, 0              // cand_b->mv and ref
    alsl.d        s0, s0, t2, 2           // bt2 index
    ld.h          s4, t3, 8               // cand_b->ref
    ld.h          t6, s0, 0               // bt2
    alsl.d        t3, t6, t3, 1           // Next cand_b += bt2*2
    vpackev.d     vr2, vr4, vr0           // a0.mv[0] a0.mv[1] a1.mv[0], a1.mv[1]
    vinsgr2vr.h   vr1, s4, 1              // a0.ref[0] a0.ref[1], a1.ref[0], a1.ref[1]
3:
    vabsd.h       vr2, vr2, vr10          // abs(mv[].xy)
    vsle.b        vr16, vr10, vr1
    vand.v        vr1, vr16, vr1          // negative ref bytes -> 0
    vshuf.b       vr1, vr11, vr11, vr1    // ref_sign[ref]
    vsrli.h       vr2, vr2, 12            // abs(mv[].xy) >> 12
    vilvl.b       vr1, vr1, vr1
    vmulwev.h.bu  vr1, vr1, vr9           // ref_sign[ref] * {1, 2}
    vseqi.w       vr2, vr2, 0             // abs(mv[].xy) <= 4096
    vpickev.h     vr2, vr2, vr2           // abs() condition to 16 bit
    vand.v        vr1, vr2, vr1           // h[0-3] contains conditions for mv[0-1]
    vhaddw.wu.hu  vr1, vr1, vr1           // Combine condition for [1] and [0]
    vpickve2gr.wu s1, vr1, 0              // Extract case for first block
    vpickve2gr.wu s2, vr1, 1              // Extract case for second block
    ld.hu         t5, t5, 2               // Fetch jump table entry
    ld.hu         s0, s0, 2
    alsl.d        s3, s1, t7, 4           // Load permutation table base on case
    vld           vr1, s3, 0
    alsl.d        s4, s2, t7, 4
    vld           vr5, s4, 0
    sub.d         t5, t2, t5              // Find jump table target
    sub.d         s0, t2, s0
    vshuf.b       vr0, vr0, vr0, vr1      // Permute cand_b to output refmvs_temporal_block
    vshuf.b       vr4, vr4, vr4, vr5
    vsle.b        vr16, vr10, vr1
    vand.v        vr0, vr16, vr0          // zero the lanes whose shuffle index was negative
    vsle.b        vr17, vr10, vr5
    vand.v        vr4, vr17, vr4
    // v1 follows on v0, with another 3 full repetitions of the pattern.
    vshuf.b       vr1, vr0, vr0, vr8      // 1, 2, 3, ... , 15, 16
    vshuf.b       vr5, vr4, vr4, vr8      // 1, 2, 3, ... , 15, 16
    // v2 ends with 3 complete repetitions of the pattern.
    vshuf.b       vr2, vr1, vr0, vr7
    vshuf.b       vr6, vr5, vr4, vr7      // 4, 5, 6, 7, ... , 12, 13, 14, 15, 16, 17, 18, 19
    jirl          ra, t5, 0               // store the first candidate
    bge           t1, t4, 4f              // if (cand_b >= end)
    vor.v         vr0, vr4, vr4
    vor.v         vr1, vr5, vr5
    vor.v         vr2, vr6, vr6
    jirl          ra, s0, 0               // store the second candidate
    blt           t3, t4, 2b              // if (cand_b < end)
4:
    addi.d        a5, a5, -1              // h--
    addi.d        a7, a7, 2               // y += 2
    add.d         a0, a0, a1              // rp += stride
    blt           zero, a5, 1b

    ld.d          s0, sp, 0x00
    ld.d          s1, sp, 0x08
    ld.d          s2, sp, 0x10
    ld.d          s3, sp, 0x18
    ld.d          s4, sp, 0x20
    addi.d        sp, sp, 0x30
    move          ra, t0
    jirl          zero, ra, 0x00

    // Store helpers: write N 5-byte entries at a3, advance a3, return via ra.
10:                                       // 1 entry
    addi.d        s1, a3, 4
    vstelm.w      vr0, a3, 0, 0           // .mv
    vstelm.b      vr0, s1, 0, 4           // .ref
    addi.d        a3, a3, 5
    jirl          zero, ra, 0x00
20:                                       // 2 entries
    addi.d        s1, a3, 8
    vstelm.d      vr0, a3, 0, 0           // first 8 bytes
    vstelm.h      vr0, s1, 0, 4           // last 2 bytes
    addi.d        a3, a3, 2*5
    jirl          zero, ra, 0x00
40:                                       // 4 entries
    vst           vr0, a3, 0
    vstelm.w      vr1, a3, 0x10, 0
    addi.d        a3, a3, 4*5
    jirl          zero, ra, 0x00
80:                                       // 8 entries
    vst           vr0, a3, 0
    vst           vr1, a3, 0x10           // This writes 6 full entries plus 2 extra bytes
    vst           vr2, a3, 5*8-16         // Write the last few, overlapping with the first write.
    addi.d        a3, a3, 8*5
    jirl          zero, ra, 0x00
160:                                      // 16 entries
    addi.d        s1, a3, 6*5
    addi.d        s2, a3, 12*5
    vst           vr0, a3, 0
    vst           vr1, a3, 0x10           // This writes 6 full entries plus 2 extra bytes
    vst           vr0, a3, 6*5
    vst           vr1, a3, 6*5+16         // Write another 6 full entries, slightly overlapping with the first set
    vstelm.d      vr0, s2, 0, 0           // Write 8 bytes (one full entry) after the first 12
    vst           vr2, a3, 5*16-16        // Write the last 3 entries
    addi.d        a3, a3, 16*5
    jirl          zero, ra, 0x00

    // One (step, helper) pair per cand_b->bs value, 22 pairs in total:
    //   +0: cand_b step in bytes, later doubled by the alsl.d
    //   +2: distance from this table back to the store helper
.save_tevs_tbl:
    .hword        16*12                   // bt2 * 12, 12 is sizeof(refmvs_block)
    .hword        .save_tevs_tbl - 160b
    .hword        16*12
    .hword        .save_tevs_tbl - 160b
    .hword        8*12
    .hword        .save_tevs_tbl - 80b
    .hword        8*12
    .hword        .save_tevs_tbl - 80b
    .hword        8*12
    .hword        .save_tevs_tbl - 80b
    .hword        8*12
    .hword        .save_tevs_tbl - 80b
    .hword        4*12
    .hword        .save_tevs_tbl - 40b
    .hword        4*12
    .hword        .save_tevs_tbl - 40b
    .hword        4*12
    .hword        .save_tevs_tbl - 40b
    .hword        4*12
    .hword        .save_tevs_tbl - 40b
    .hword        2*12
    .hword        .save_tevs_tbl - 20b
    .hword        2*12
    .hword        .save_tevs_tbl - 20b
    .hword        2*12
    .hword        .save_tevs_tbl - 20b
    .hword        2*12
    .hword        .save_tevs_tbl - 20b
    .hword        2*12
    .hword        .save_tevs_tbl - 20b
    .hword        1*12
    .hword        .save_tevs_tbl - 10b
    .hword        1*12
    .hword        .save_tevs_tbl - 10b
    .hword        1*12
    .hword        .save_tevs_tbl - 10b
    .hword        1*12
    .hword        .save_tevs_tbl - 10b
    .hword        1*12
    .hword        .save_tevs_tbl - 10b
    .hword        1*12
    .hword        .save_tevs_tbl - 10b
    .hword        1*12
    .hword        .save_tevs_tbl - 10b
endfunc
Messung V0.5 in Prozent C=91 H=78 G=84