Quelle pixman-arm-simd-asm.S Sprache: Sparc

/*
* Copyright © 2012 Raspberry Pi Foundation
* Copyright © 2012 RISC OS Open Ltd
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that
* copyright notice and this permission notice appear in supporting
* documentation, and that the name of the copyright holders not be used in
* advertising or publicity pertaining to distribution of the software without
* specific, written prior permission.  The copyright holders make no
* representations about the suitability of this software for any purpose.  It
* is provided "as is" without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
* SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
* SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
* SOFTWARE.
*
* Author:  Ben Avison (bavison@riscosopen.org)
*
*/

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.arch armv6
.object_arch armv4
.arm
.altmacro
.p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"

pixman_syntax_unified

/* A head macro should do all processing which results in an output of up to
* 16 bytes, as far as the final load instruction. The corresponding tail macro
* should complete the processing of the up-to-16 bytes. The calling macro will
* sometimes choose to insert a preload or a decrement of X between them.
*   cond           ARM condition code for code block
*   numbytes       Number of output bytes that should be generated this time
*   firstreg       First WK register in which to place output
*   unaligned_src  Whether to use non-wordaligned loads of source image
*   unaligned_mask Whether to use non-wordaligned loads of mask image
*   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
*/

.macro blit_init
        line_saved_regs STRIDE_D, STRIDE_S
.endm

.macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm

.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
110:    pixld   , 16, 0, SRC, \unaligned_src
        pixld   , 16, 4, SRC, \unaligned_src
        pld     [SRC, SCRATCH]
        pixst   , 16, 0, DST
        pixst   , 16, 4, DST
        subs    X, X, #32*8/src_bpp
        bhs     110b
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

/******************************************************************************/

.macro src_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro src_n_0565_init
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro src_n_8_init
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, SRC, lsl #8
        orr     SRC, SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro fill_process_tail  cond, numbytes, firstreg
    WK4     .req    SRC
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
        pixst   \cond, \numbytes, 4, DST
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8888_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_0565_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

/******************************************************************************/

.macro src_x888_8888_pixel, cond, reg
        orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
.endm

.macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm

.macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
        src_x888_8888_pixel \cond, %(\firstreg+0)
.if \numbytes >= 8
        src_x888_8888_pixel \cond, %(\firstreg+1)
  .if \numbytes == 16
        src_x888_8888_pixel \cond, %(\firstreg+2)
        src_x888_8888_pixel \cond, %(\firstreg+3)
  .endif
.endif
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    pixman_composite_src_x888_8888_process_head, \
    pixman_composite_src_x888_8888_process_tail

/******************************************************************************/

.macro src_0565_8888_init
        /* Hold loop invariants in MASK and STRIDE_M */
        ldr     MASK, =0x07E007E0
        mov     STRIDE_M, #0xFF000000
        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
        ldr     SCRATCH, =0x80008000
        uadd8   SCRATCH, SCRATCH, SCRATCH
.endm

.macro src_0565_8888_2pixels, reg1, reg2
        and     SCRATCH, WK\()\reg1, MASK                   @ 00000GGGGGG0000000000gggggg00000
        bic     WK\()\reg2, WK\()\reg1, MASK                @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6           @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK\()\reg1, WK\()\reg2, lsl #16             @ rrrrr000000bbbbb0000000000000000
        mov     SCRATCH, SCRATCH, ror #19                   @ GGGG0000ggggggggggg00000GGGGGGGG
        bic     WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
        pkhtb   WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
        sel     WK\()\reg1, WK\()\reg1, SCRATCH             @ rrrrrrrrggggggggbbbbbbbb--------
        mov     SCRATCH, SCRATCH, ror #16                   @ ggg00000GGGGGGGGGGGG0000gggggggg
        pkhtb   WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
        sel     WK\()\reg2, WK\()\reg2, SCRATCH             @ RRRRRRRRGGGGGGGGBBBBBBBB--------
        orr     WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8    @ 11111111rrrrrrrrggggggggbbbbbbbb
        orr     WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8    @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm

/* This version doesn't need STRIDE_M, but is one instruction longer.
   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
        and     SCRATCH, WK\()\reg1, MASK                   @ 00000GGGGGG0000000000gggggg00000
        bic     WK\()\reg1, WK\()\reg1, MASK                @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6           @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK\()\reg2, WK\()\reg1, lsr #16             @ 0000000000000000RRRRR000000BBBBB
        mov     SCRATCH, SCRATCH, ror #27                   @ GGGGGGGGGGGG0000ggggggggggg00000
        bic     WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
        mov     WK\()\reg2, WK\()\reg2, lsl #3              @ 0000000000000RRRRR000000BBBBB000
        mov     WK\()\reg1, WK\()\reg1, lsl #3              @ 0000000000000rrrrr000000bbbbb000
        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        pkhbt   WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
        pkhbt   WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK\()\reg2, SCRATCH, WK\()\reg2             @ --------RRRRRRRRGGGGGGGGBBBBBBBB
        sel     WK\()\reg1, SCRATCH, WK\()\reg1             @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK\()\reg2, WK\()\reg2, #0xFF000000         @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
        orr     WK\()\reg1, WK\()\reg1, #0xFF000000         @ 11111111rrrrrrrrggggggggbbbbbbbb
*/

.macro src_0565_8888_1pixel, reg
        bic     SCRATCH, WK\()\reg, MASK                 @ 0000000000000000rrrrr000000bbbbb
        and     WK\()\reg, WK\()\reg, MASK               @ 000000000000000000000gggggg00000
        mov     SCRATCH, SCRATCH, lsl #3                 @ 0000000000000rrrrr000000bbbbb000
        mov     WK\()\reg, WK\()\reg, lsl #5             @ 0000000000000000gggggg0000000000
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5        @ 0000000000000rrrrrrrrrr0bbbbbbbb
        orr     WK\()\reg, WK\()\reg, WK\()\reg, lsr #6  @ 000000000000000gggggggggggg00000
        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5        @ --------rrrrrrrr--------bbbbbbbb
        sel     WK\()\reg, WK\()\reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK\()\reg, WK\()\reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm

.macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
.if \numbytes == 16
        pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
.elseif \numbytes == 8
        pixld   , 4, \firstreg, SRC, \unaligned_src
.elseif \numbytes == 4
        pixld   , 2, \firstreg, SRC, \unaligned_src
.endif
.endm

.macro src_0565_8888_process_tail   cond, numbytes, firstreg
.if \numbytes == 16
        src_0565_8888_2pixels \firstreg, %(\firstreg+1)
        src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
.elseif \numbytes == 8
        src_0565_8888_2pixels \firstreg, %(\firstreg+1)
.else
        src_0565_8888_1pixel \firstreg
.endif
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    3, /* prefetch distance */ \
    src_0565_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_0565_8888_process_head, \
    src_0565_8888_process_tail

/******************************************************************************/

.macro src_x888_0565_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x001F001F
        line_saved_regs  STRIDE_S, ORIG_W
.endm

.macro src_x888_0565_1pixel  s, d
        and     WK\()\d, MASK, WK\()\s, lsr #3           @ 00000000000rrrrr00000000000bbbbb
        and     STRIDE_S, WK\()\s, #0xFC00               @ 0000000000000000gggggg0000000000
        orr     WK\()\d, WK\()\d, WK\()\d, lsr #5        @ 00000000000-----rrrrr000000bbbbb
        orr     WK\()\d, WK\()\d, STRIDE_S, lsr #5       @ 00000000000-----rrrrrggggggbbbbb
        /* Top 16 bits are discarded during the following STRH */
.endm

.macro src_x888_0565_2pixels  slo, shi, d, tmp
        and     SCRATCH, WK\()\shi, #0xFC00                 @ 0000000000000000GGGGGG0000000000
        and     WK\()\tmp, MASK, WK\()\shi, lsr #3          @ 00000000000RRRRR00000000000BBBBB
        and     WK\()\shi, MASK, WK\()\slo, lsr #3          @ 00000000000rrrrr00000000000bbbbb
        orr     WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5     @ 00000000000-----RRRRR000000BBBBB
        orr     WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5       @ 00000000000-----RRRRRGGGGGGBBBBB
        and     SCRATCH, WK\()\slo, #0xFC00                 @ 0000000000000000gggggg0000000000
        orr     WK\()\shi, WK\()\shi, WK\()\shi, lsr #5     @ 00000000000-----rrrrr000000bbbbb
        orr     WK\()\shi, WK\()\shi, SCRATCH, lsr #5       @ 00000000000-----rrrrrggggggbbbbb
        pkhbt   WK\()\d, WK\()\shi, WK\()\tmp, lsl #16      @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
.endm

.macro src_x888_0565_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        WK4     .req    STRIDE_S
        WK5     .req    STRIDE_M
        WK6     .req    WK3
        WK7     .req    ORIG_W
.if \numbytes == 16
        pixld   , 16, 4, SRC, 0
        src_x888_0565_2pixels  4, 5, 0, 0
        pixld   , 8, 4, SRC, 0
        src_x888_0565_2pixels  6, 7, 1, 1
        pixld   , 8, 6, SRC, 0
.else
        pixld   , \numbytes*2, 4, SRC, 0
.endif
.endm

.macro src_x888_0565_process_tail   cond, numbytes, firstreg
.if \numbytes == 16
        src_x888_0565_2pixels  4, 5, 2, 2
        src_x888_0565_2pixels  6, 7, 3, 4
.elseif \numbytes == 8
        src_x888_0565_2pixels  4, 5, 1, 1
        src_x888_0565_2pixels  6, 7, 2, 2
.elseif \numbytes == 4
        src_x888_0565_2pixels  4, 5, 1, 1
.else
        src_x888_0565_1pixel  4, 1
.endif
.if \numbytes == 16
        pixst   , \numbytes, 0, DST
.else
        pixst   , \numbytes, 1, DST
.endif
        .unreq  WK4
        .unreq  WK5
        .unreq  WK6
        .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    src_x888_0565_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_x888_0565_process_head, \
    src_x888_0565_process_tail

/******************************************************************************/

.macro add_8_8_8pixels  cond, dst1, dst2
        uqadd8\()\cond  WK\()\dst1, WK\()\dst1, MASK
        uqadd8\()\cond  WK\()\dst2, WK\()\dst2, STRIDE_M
.endm

.macro add_8_8_4pixels  cond, dst
        uqadd8\()\cond  WK\()\dst, WK\()\dst, MASK
.endm

.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    MASK
    WK5     .req    STRIDE_M
.if \numbytes == 16
        pixld   \cond, 8, 4, SRC, \unaligned_src
        pixld   \cond, 16, \firstreg, DST, 0
        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
        pixld   \cond, 8, 4, SRC, \unaligned_src
.else
        pixld   \cond, \numbytes, 4, SRC, \unaligned_src
        pixld   \cond, \numbytes, \firstreg, DST, 0
.endif
    .unreq  WK4
    .unreq  WK5
.endm

.macro add_8_8_process_tail  cond, numbytes, firstreg
.if \numbytes == 16
        add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
.elseif \numbytes == 8
        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
.else
        add_8_8_4pixels \cond, \firstreg
.endif
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    add_8_8_process_head, \
    add_8_8_process_tail

/******************************************************************************/

.macro over_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
.endm

.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        pixld   , \numbytes, %(4+\firstreg), SRC, \unaligned_src
        pixld   , \numbytes, \firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
        /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
        teq     WK\()\reg0, #0
.if \numbytes > 4
        teqeq   WK\()\reg1, #0
  .if \numbytes > 8
        teqeq   WK\()\reg2, #0
        teqeq   WK\()\reg3, #0
  .endif
.endif
.endm

.macro over_8888_8888_prepare  next
        mov     WK\()\next, WK\()\next, lsr #24
.endm

.macro over_8888_8888_1pixel src, dst, offset, next
        /* src = destination component multiplier */
        rsb     WK\()\src, WK\()\src, #255
        /* Split even/odd bytes of dst into SCRATCH/dst */
        uxtb16  SCRATCH, WK\()\dst
        uxtb16  WK\()\dst, WK\()\dst, ror #8
        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
        mla     SCRATCH, SCRATCH, WK\()\src, MASK
        mla     WK\()\dst, WK\()\dst, WK\()\src, MASK
        /* Where we would have had a stall between the result of the first MLA and the shifter input,
         * reload the complete source pixel */
        ldr     WK\()\src, [SRC, #\offset]
        /* Multiply by 257/256 to approximate 256/255 */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        /* In this stall, start processing the next pixel */
.if \offset < -4
        mov     WK\()\next, WK\()\next, lsr #24
.endif
        uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
        /* Recombine even/odd bytes of multiplied destination */
        mov     SCRATCH, SCRATCH, ror #8
        sel     WK\()\dst, SCRATCH, WK\()\dst
        /* Saturated add of source to multiplied destination */
        uqadd8  WK\()\dst, WK\()\dst, WK\()\src
.endm

.macro over_8888_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
        beq     10f
        over_8888_8888_prepare  %(4+\firstreg)
.set PROCESS_REG, \firstreg
.set PROCESS_OFF, -\numbytes
.rept \numbytes / 4
        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
  .set PROCESS_OFF, PROCESS_OFF+4
.endr
        pixst   , \numbytes, \firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_8888_process_head, \
    over_8888_8888_process_tail

/******************************************************************************/

/* Multiply each byte of a word by a byte.
* Useful when there aren't any obvious ways to fill the stalls with other instructions.
* word  Register containing 4 bytes
* byte  Register containing byte multiplier (bits 8-31 must be 0)
* tmp   Scratch register
* half  Register containing the constant 0x00800080
* GE[3:0] bits must contain 0101
*/
.macro mul_8888_8  word, byte, tmp, half
        /* Split even/odd bytes of word apart */
        uxtb16  \tmp, \word
        uxtb16  \word, \word, ror #8
        /* Multiply bytes together with rounding, then by 257/256 */
        mla     \tmp, \tmp, \byte, \half
        mla     \word, \word, \byte, \half /* 1 stall follows */
        uxtab16 \tmp, \tmp, \tmp, ror #8  /* 1 stall follows */
        uxtab16 \word, \word, \word, ror #8
        /* Recombine bytes */
        mov     \tmp, \tmp, ror #8
        sel     \word, \tmp, \word
.endm

/******************************************************************************/

.macro over_8888_n_8888_init
        /* Mask is constant */
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        /* Hold loop invariant in STRIDE_M */
        ldr     STRIDE_M, =0x00800080
        /* We only want the alpha bits of the constant mask */
        mov     MASK, MASK, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, STRIDE_M, STRIDE_M
        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
.endm

.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        pixld   , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
        pixld   , \numbytes, \firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

.macro over_8888_n_8888_1pixel src, dst
        mul_8888_8  WK\()\src, MASK, SCRATCH, STRIDE_M
        sub     WK7, WK6, WK\()\src, lsr #24
        mul_8888_8  WK\()\dst, WK7, SCRATCH, STRIDE_M
        uqadd8  WK\()\dst, WK\()\dst, WK\()\src
.endm

.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
        beq     10f
        mov     WK6, #255
.set PROCESS_REG, \firstreg
.rept \numbytes / 4
  .if \numbytes == 16 && PROCESS_REG == 2
        /* We're using WK6 and WK7 as temporaries, so half way through
         * 4 pixels, reload the second two source pixels but this time
         * into WK4 and WK5 */
        ldmdb   SRC, {WK4, WK5}
  .endif
        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
.endr
        pixst   , \numbytes, \firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_n_8888_process_head, \
    over_8888_n_8888_process_tail

/******************************************************************************/

.macro over_n_8_8888_init
        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
        ldr     SCRATCH, =0x00800080
        uxtb16  STRIDE_S, SRC
        uxtb16  SRC, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

.macro over_n_8_8888_newline
        ldr     STRIDE_D, =0x00800080
        b       1f
.ltorg
1:
.endm

.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_M
        pixld   , \numbytes/4, 4, MASK, \unaligned_mask
        pixld   , \numbytes, \firstreg, DST, 0
    .unreq  WK4
.endm

.macro over_n_8_8888_1pixel src, dst
        uxtb    Y, WK4, ror #\src*8
        /* Trailing part of multiplication of source */
        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
        mla     Y, SRC, Y, STRIDE_D
        mov     ORIG_W, #255
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 Y, Y, Y, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sub     ORIG_W, ORIG_W, Y, lsr #24
        sel     Y, SCRATCH, Y
        /* Then multiply the destination */
        mul_8888_8  WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
        uqadd8  WK\()\dst, WK\()\dst, Y
.endm

.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_M
        teq     WK4, #0
        beq     10f
.set PROCESS_REG, \firstreg
.rept \numbytes / 4
        over_n_8_8888_1pixel  %(PROCESS_REG-\firstreg), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
.endr
        pixst   , \numbytes, \firstreg, DST
10:
    .unreq  WK4
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_n_8_8888_init, \
    over_n_8_8888_newline, \
    nop_macro, /* cleanup */ \
    over_n_8_8888_process_head, \
    over_n_8_8888_process_tail

/******************************************************************************/

.macro over_reverse_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     MASK, =0x00800080
        /* Split source pixel into RB/AG parts */
        uxtb16  STRIDE_S, SRC
        uxtb16  STRIDE_M, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs  STRIDE_D, ORIG_W
.endm

.macro over_reverse_n_8888_newline
        mov     STRIDE_D, #0xFF
.endm

.macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   , \numbytes, \firstreg, DST, 0
.endm

.macro over_reverse_n_8888_1pixel  d, is_only
        teq     WK\()\d, #0
        beq     8f       /* replace with source */
        bics    ORIG_W, STRIDE_D, WK\()\d, lsr #24
.if \is_only == 1
        beq     49f      /* skip store */
.else
        beq     9f       /* write same value back */
.endif
        mla     SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
        mla     ORIG_W, STRIDE_M, ORIG_W, MASK  /* alpha/green */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sel     ORIG_W, SCRATCH, ORIG_W
        uqadd8  WK\()\d, WK\()\d, ORIG_W
        b       9f
8:      mov     WK\()\d, SRC
9:
.endm

.macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
.if \numbytes == 4
        over_reverse_n_8888_1pixel  \reg1, 1
.else
        and     SCRATCH, WK\()\reg1, WK\()\reg2
  .if \numbytes == 16
        and     SCRATCH, SCRATCH, WK\()\reg3
        and     SCRATCH, SCRATCH, WK\()\reg4
  .endif
        mvns    SCRATCH, SCRATCH, asr #24
        beq     49f /* skip store if all opaque */
        over_reverse_n_8888_1pixel  \reg1, 0
        over_reverse_n_8888_1pixel  \reg2, 0
  .if \numbytes == 16
        over_reverse_n_8888_1pixel  \reg3, 0
        over_reverse_n_8888_1pixel  \reg4, 0
  .endif
.endif
        pixst   , \numbytes, \reg1, DST
49:
.endm

.macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
        over_reverse_n_8888_tail  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    over_reverse_n_8888_init, \
    over_reverse_n_8888_newline, \
    nop_macro, /* cleanup */ \
    over_reverse_n_8888_process_head, \
    over_reverse_n_8888_process_tail

/******************************************************************************/

.macro over_white_8888_8888_ca_init
        HALF    .req    SRC
        TMP0    .req    STRIDE_D
        TMP1    .req    STRIDE_S
        TMP2    .req    STRIDE_M
        TMP3    .req    ORIG_W
        WK4     .req    SCRATCH
        line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
        ldr     SCRATCH, =0x800080
        mov     HALF, #0x80
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        .set DST_PRELOAD_BIAS, 8
.endm

.macro over_white_8888_8888_ca_cleanup
        .set DST_PRELOAD_BIAS, 0
        .unreq  HALF
        .unreq  TMP0
        .unreq  TMP1
        .unreq  TMP2
        .unreq  TMP3
        .unreq  WK4
.endm

.macro over_white_8888_8888_ca_combine  m, d
        uxtb16  TMP1, TMP0                /* rb_notmask */
        uxtb16  TMP2, \d                  /* rb_dest; 1 stall follows */
        smlatt  TMP3, TMP2, TMP1, HALF    /* red */
        smlabb  TMP2, TMP2, TMP1, HALF    /* blue */
        uxtb16  TMP0, TMP0, ror #8        /* ag_notmask */
        uxtb16  TMP1, \d, ror #8          /* ag_dest; 1 stall follows */
        smlatt  \d, TMP1, TMP0, HALF      /* alpha */
        smlabb  TMP1, TMP1, TMP0, HALF    /* green */
        pkhbt   TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
        pkhbt   TMP1, TMP1, \d, lsl #16   /* ag */
        uxtab16 TMP0, TMP0, TMP0, ror #8
        uxtab16 TMP1, TMP1, TMP1, ror #8
        mov     TMP0, TMP0, ror #8
        sel     \d, TMP0, TMP1
        uqadd8  \d, \d, \m                 /* d is a late result */
.endm

.macro over_white_8888_8888_ca_1pixel_head
        pixld   , 4, 1, MASK, 0
        pixld   , 4, 3, DST, 0
.endm

.macro over_white_8888_8888_ca_1pixel_tail
        mvn     TMP0, WK1
        teq     WK1, WK1, asr #32
        bne     1f
        bcc     3f
        mov     WK3, WK1
        b       2f
1:      over_white_8888_8888_ca_combine WK1, WK3
2:      pixst   , 4, 3, DST
3:
.endm

.macro over_white_8888_8888_ca_2pixels_head
        pixld   , 8, 1, MASK, 0
.endm

.macro over_white_8888_8888_ca_2pixels_tail
        pixld   , 8, 3, DST
        mvn     TMP0, WK1
        teq     WK1, WK1, asr #32
        bne     1f
        movcs   WK3, WK1
        bcs     2f
        teq     WK2, #0
        beq     5f
        b       2f
1:      over_white_8888_8888_ca_combine WK1, WK3
2:      mvn     TMP0, WK2
        teq     WK2, WK2, asr #32
        bne     3f
        movcs   WK4, WK2
        b       4f
3:      over_white_8888_8888_ca_combine WK2, WK4
4:      pixst   , 8, 3, DST
5:
.endm

.macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
.if \numbytes == 4
        over_white_8888_8888_ca_1pixel_head
.else
  .if \numbytes == 16
        over_white_8888_8888_ca_2pixels_head
        over_white_8888_8888_ca_2pixels_tail
  .endif
        over_white_8888_8888_ca_2pixels_head
.endif
.endm

.macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg
.if \numbytes == 4
        over_white_8888_8888_ca_1pixel_tail
.else
        over_white_8888_8888_ca_2pixels_tail
.endif
.endm

generate_composite_function \
    pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH \
    2, /* prefetch distance */ \
    over_white_8888_8888_ca_init, \
    nop_macro, /* newline */ \
    over_white_8888_8888_ca_cleanup, \
    over_white_8888_8888_ca_process_head, \
    over_white_8888_8888_ca_process_tail

.macro over_n_8888_8888_ca_init
        /* Set up constants. RB_SRC and AG_SRC are in registers;
         * RB_FLDS, A_SRC, and the two HALF values need to go on the
         * stack (and the ful SRC value is already there) */
        ldr     SCRATCH, [sp, #ARGS_STACK_OFFSET]
        mov     WK0, #0x00FF0000
        orr     WK0, WK0, #0xFF        /* RB_FLDS (0x00FF00FF) */
        mov     WK1, #0x80             /* HALF default value */
        mov     WK2, SCRATCH, lsr #24  /* A_SRC */
        orr     WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */
        push    {WK0-WK3}
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
        uxtb16  SRC, SCRATCH
        uxtb16  STRIDE_S, SCRATCH, ror #8

        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, WK3, WK3

        .unreq  WK0
        .unreq  WK1
        .unreq  WK2
        .unreq  WK3
        WK0     .req    Y
        WK1     .req    STRIDE_D
        RB_SRC  .req    SRC
        AG_SRC  .req    STRIDE_S
        WK2     .req    STRIDE_M
        RB_FLDS .req    r8       /* the reloaded constants have to be at consecutive registers starting at an even one */
        A_SRC   .req    r8
        HALF    .req    r9
        WK3     .req    r10
        WK4     .req    r11
        WK5     .req    SCRATCH
        WK6     .req    ORIG_W

        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

.macro over_n_8888_8888_ca_cleanup
        add     sp, sp, #16
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16

        .unreq  WK0
        .unreq  WK1
        .unreq  RB_SRC
        .unreq  AG_SRC
        .unreq  WK2
        .unreq  RB_FLDS
        .unreq  A_SRC
        .unreq  HALF
        .unreq  WK3
        .unreq  WK4
        .unreq  WK5
        .unreq  WK6
        WK0     .req    r8
        WK1     .req    r9
        WK2     .req    r10
        WK3     .req    r11
.endm

.macro over_n_8888_8888_ca_1pixel_head
        pixld   , 4, 6, MASK, 0
        pixld   , 4, 0, DST, 0
.endm

.macro over_n_8888_8888_ca_1pixel_tail
        ldrd    A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
        uxtb16  WK1, WK6                 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
        teq     WK6, WK6, asr #32        /* Zc if transparent, ZC if opaque */
        bne     20f
        bcc     40f
        /* Mask is fully opaque (all channels) */
        ldr     WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
        eors    A_SRC, A_SRC, #0xFF
        bne     10f
        /* Source is also opaque - same as src_8888_8888 */
        mov     WK0, WK6
        b       30f
10:     /* Same as over_8888_8888 */
        mul_8888_8 WK0, A_SRC, WK5, HALF
        uqadd8  WK0, WK0, WK6
        b       30f
20:     /* No simplifications possible - do it the hard way */
        uxtb16  WK2, WK6, ror #8         /* ag_mask */
        mla     WK3, WK1, A_SRC, HALF    /* rb_mul; 2 cycles */
        mla     WK4, WK2, A_SRC, HALF    /* ag_mul; 2 cycles */
        ldrd    RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
        uxtb16  WK5, WK0                 /* rb_dest */
        uxtab16 WK3, WK3, WK3, ror #8
        uxtb16  WK6, WK0, ror #8         /* ag_dest */
        uxtab16 WK4, WK4, WK4, ror #8
        smlatt  WK0, RB_SRC, WK1, HALF   /* red1 */
        smlabb  WK1, RB_SRC, WK1, HALF   /* blue1 */
        bic     WK3, RB_FLDS, WK3, lsr #8
        bic     WK4, RB_FLDS, WK4, lsr #8
        pkhbt   WK1, WK1, WK0, lsl #16   /* rb1 */
        smlatt  WK0, WK5, WK3, HALF      /* red2 */
        smlabb  WK3, WK5, WK3, HALF      /* blue2 */
        uxtab16 WK1, WK1, WK1, ror #8
        smlatt  WK5, AG_SRC, WK2, HALF   /* alpha1 */
        pkhbt   WK3, WK3, WK0, lsl #16   /* rb2 */
        smlabb  WK0, AG_SRC, WK2, HALF   /* green1 */
        smlatt  WK2, WK6, WK4, HALF      /* alpha2 */
        smlabb  WK4, WK6, WK4, HALF      /* green2 */
        pkhbt   WK0, WK0, WK5, lsl #16   /* ag1 */
        uxtab16 WK3, WK3, WK3, ror #8
        pkhbt   WK4, WK4, WK2, lsl #16   /* ag2 */
        uxtab16 WK0, WK0, WK0, ror #8
        uxtab16 WK4, WK4, WK4, ror #8
        mov     WK1, WK1, ror #8
        mov     WK3, WK3, ror #8
        sel     WK2, WK1, WK0            /* recombine source*mask */
        sel     WK1, WK3, WK4            /* recombine dest*(1-source_alpha*mask) */
        uqadd8  WK0, WK1, WK2            /* followed by 1 stall */
30:     /* The destination buffer is already in the L1 cache, so
         * there's little point in amalgamating writes */
        pixst   , 4, 0, DST
40:
.endm

.macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
.rept (\numbytes / 4) - 1
        over_n_8888_8888_ca_1pixel_head
        over_n_8888_8888_ca_1pixel_tail
.endr
        over_n_8888_8888_ca_1pixel_head
.endm

.macro over_n_8888_8888_ca_process_tail  cond, numbytes, firstreg
        over_n_8888_8888_ca_1pixel_tail
.endm

pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
        ldr     ip, [sp]
        cmp     ip, #-1
        beq     pixman_composite_over_white_8888_8888_ca_asm_armv6
        /* else drop through... */
pixman_end_asm_function
generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
    2, /* prefetch distance */ \
    over_n_8888_8888_ca_init, \
    nop_macro, /* newline */ \
    over_n_8888_8888_ca_cleanup, \
    over_n_8888_8888_ca_process_head, \
    over_n_8888_8888_ca_process_tail

/******************************************************************************/

.macro in_reverse_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        /* Offset the source pointer: we only need the alpha bytes */
        add     SRC, SRC, #3
        line_saved_regs  ORIG_W
.endm

.macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
        ldrb    ORIG_W, [SRC], #4
.if \numbytes >= 8
        ldrb    WK\()\reg1, [SRC], #4
  .if \numbytes == 16
        ldrb    WK\()\reg2, [SRC], #4
        ldrb    WK\()\reg3, [SRC], #4
  .endif
.endif
        add     DST, DST, #\numbytes
.endm

.macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        in_reverse_8888_8888_head  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
.endm

.macro in_reverse_8888_8888_1pixel  s, d, offset, is_only
.if \is_only != 1
        movs    \s, ORIG_W
  .if \offset != 0
        ldrb    ORIG_W, [SRC, #\offset]
  .endif
        beq     1f
        teq     STRIDE_M, #0xFF
        beq     2f
.endif
        uxtb16  SCRATCH, \d                 /* rb_dest */
        uxtb16  \d, \d, ror #8               /* ag_dest */
        mla     SCRATCH, SCRATCH, \s, MASK
        mla     \d, \d, \s, MASK
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 \d, \d, \d, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sel     \d, SCRATCH, \d
        b       2f
.if \offset == 0
48:     /* Last mov d,#0 of the set - used as part of shortcut for
         * source values all 0 */
.endif
1:      mov     \d, #0
2:
.endm

.macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4
.if \numbytes == 4
        teq     ORIG_W, ORIG_W, asr #32
        ldrne   WK\()\reg1, [DST, #-4]
.elseif \numbytes == 8
        teq     ORIG_W, WK\()\reg1
        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
        ldmdbne DST, {WK\()\reg1-WK\()\reg2}
.else
        teq     ORIG_W, WK\()\reg1
        teqeq   ORIG_W, WK\()\reg2
        teqeq   ORIG_W, WK\()\reg3
        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
        ldmdbne DST, {WK\()\reg1-WK\()\reg4}
.endif
        cmnne   DST, #0   /* clear C if NE */
        bcs     49f       /* no writes to dest if source all -1 */
        beq     48f       /* set dest to all 0 if source all 0 */
.if \numbytes == 4
        in_reverse_8888_8888_1pixel  ORIG_W, WK\()\reg1, 0, 1
        str     WK\()\reg1, [DST, #-4]
.elseif \numbytes == 8
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg1, -4, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg2, 0, 0
        stmdb   DST, {WK\()\reg1-WK\()\reg2}
.else
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg1, -12, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg2, -8, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg3, -4, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg4, 0, 0
        stmdb   DST, {WK\()\reg1-WK\()\reg4}
.endif
49:
.endm

.macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg
        in_reverse_8888_8888_tail  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm

generate_composite_function \
    pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \
    2, /* prefetch distance */ \
    in_reverse_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    in_reverse_8888_8888_process_head, \
    in_reverse_8888_8888_process_tail

/******************************************************************************/

.macro over_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Hold multiplier for destination in STRIDE_M */
        mov     STRIDE_M, #255
        sub     STRIDE_M, STRIDE_M, SRC, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
.endm

.macro over_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   , \numbytes, \firstreg, DST, 0
.endm

.macro over_n_8888_1pixel dst
        mul_8888_8  WK\()\dst, STRIDE_M, SCRATCH, MASK
        uqadd8  WK\()\dst, WK\()\dst, SRC
.endm

.macro over_n_8888_process_tail  cond, numbytes, firstreg
.set PROCESS_REG, \firstreg
.rept \numbytes / 4
        over_n_8888_1pixel %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
.endr
        pixst   , \numbytes, \firstreg, DST
.endm

generate_composite_function \
    pixman_composite_over_n_8888_asm_armv6, 0, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE \
    2, /* prefetch distance */ \
    over_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_n_8888_process_head, \
    over_n_8888_process_tail

/******************************************************************************/

Messung V0.5

¤ Dauer der Verarbeitung: 0.62 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.