/*
* Copyright (c) 2022 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
#include <arm_neon.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/fdct_neon.h"
// Load 32 rows of eight int16_t values and apply the outermost butterfly
// cross. Row r is read from a + r * stride. For every i in [0, 16):
//   b[i]      = row[i] + row[31 - i]
//   b[31 - i] = row[i] - row[31 - i]
// The first 8 / last 8 rows are crossed first, then the middle 16.
static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) {
  int i;

  // Outer rows: sums into b[0..7].
  for (i = 0; i < 8; ++i) {
    b[i] = vaddq_s16(vld1q_s16(a + i * stride),
                     vld1q_s16(a + (31 - i) * stride));
  }
  // Outer rows: differences into b[24..31] (mirror row minus this row).
  for (i = 24; i < 32; ++i) {
    b[i] = vsubq_s16(vld1q_s16(a + (31 - i) * stride),
                     vld1q_s16(a + i * stride));
  }

  // Middle rows: sums into b[8..15].
  for (i = 8; i < 16; ++i) {
    b[i] = vaddq_s16(vld1q_s16(a + i * stride),
                     vld1q_s16(a + (31 - i) * stride));
  }
  // Middle rows: differences into b[16..23].
  for (i = 16; i < 24; ++i) {
    b[i] = vsubq_s16(vld1q_s16(a + (31 - i) * stride),
                     vld1q_s16(a + i * stride));
  }
}
// Store 32 16x8 values, assuming stride == 32.
// Slight twist: store horizontally in blocks of 8. The vectors are written
// in the order b[0], b[8], b[16], b[24], b[1], b[9], b[17], b[25], ... and
// the destination pointer advances by 8 elements after every vector.
static INLINE void store(tran_low_t *a, const int16x8_t *b) {
  int row;
  int block;

  for (row = 0; row < 8; ++row) {
    for (block = 0; block < 32; block += 8) {
      store_s16q_to_tran_low(a, b[row + block]);
      a += 8;
    }
  }
}
// Shift every lane of the 32 input vectors left by 2 (i.e. multiply by 4)
// and write the results to the matching slots of out.
static INLINE void scale_input(const int16x8_t *in /*32*/,
                               int16x8_t *out /*32*/) {
  int i;

  for (i = 0; i < 32; ++i) {
    out[i] = vshlq_n_s16(in[i], 2);
  }
}
// First-pass butterfly network of the 32-point forward DCT.
//
// in:  32 vectors of int16 lanes on which the stage 1 cross has already been
//      applied (see load_cross(), which performs that cross while loading).
// out: 32 result vectors. Every result goes through sub_round_shift_s16()
//      before being written (see the "Final stage" note below).
//
// All intermediate values are kept in 16-bit lanes. The two scratch arrays a
// and b are used alternately: each stage reads one and writes the other.
static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];

  // Stage 1: Done as part of the load.

  // Stage 2.
  // Mini cross. X the first 16 values and the middle 8 of the second half.
  a[0] = vaddq_s16(in[0], in[15]);
  a[1] = vaddq_s16(in[1], in[14]);
  a[2] = vaddq_s16(in[2], in[13]);
  a[3] = vaddq_s16(in[3], in[12]);
  a[4] = vaddq_s16(in[4], in[11]);
  a[5] = vaddq_s16(in[5], in[10]);
  a[6] = vaddq_s16(in[6], in[9]);
  a[7] = vaddq_s16(in[7], in[8]);

  a[8] = vsubq_s16(in[7], in[8]);
  a[9] = vsubq_s16(in[6], in[9]);
  a[10] = vsubq_s16(in[5], in[10]);
  a[11] = vsubq_s16(in[4], in[11]);
  a[12] = vsubq_s16(in[3], in[12]);
  a[13] = vsubq_s16(in[2], in[13]);
  a[14] = vsubq_s16(in[1], in[14]);
  a[15] = vsubq_s16(in[0], in[15]);

  a[16] = in[16];
  a[17] = in[17];
  a[18] = in[18];
  a[19] = in[19];

  butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27],
                                     &a[20]);
  butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26],
                                     &a[21]);
  butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25],
                                     &a[22]);
  butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24],
                                     &a[23]);

  a[28] = in[28];
  a[29] = in[29];
  a[30] = in[30];
  a[31] = in[31];

  // Stage 3.
  b[0] = vaddq_s16(a[0], a[7]);
  b[1] = vaddq_s16(a[1], a[6]);
  b[2] = vaddq_s16(a[2], a[5]);
  b[3] = vaddq_s16(a[3], a[4]);

  b[4] = vsubq_s16(a[3], a[4]);
  b[5] = vsubq_s16(a[2], a[5]);
  b[6] = vsubq_s16(a[1], a[6]);
  b[7] = vsubq_s16(a[0], a[7]);

  b[8] = a[8];
  b[9] = a[9];

  butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13],
                                     &b[10]);
  butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12],
                                     &b[11]);

  b[14] = a[14];
  b[15] = a[15];

  // in[16..19] and in[28..31] were passed through stage 2 unchanged, so they
  // are read straight from the input here.
  b[16] = vaddq_s16(in[16], a[23]);
  b[17] = vaddq_s16(in[17], a[22]);
  b[18] = vaddq_s16(in[18], a[21]);
  b[19] = vaddq_s16(in[19], a[20]);

  b[20] = vsubq_s16(in[19], a[20]);
  b[21] = vsubq_s16(in[18], a[21]);
  b[22] = vsubq_s16(in[17], a[22]);
  b[23] = vsubq_s16(in[16], a[23]);

  b[24] = vsubq_s16(in[31], a[24]);
  b[25] = vsubq_s16(in[30], a[25]);
  b[26] = vsubq_s16(in[29], a[26]);
  b[27] = vsubq_s16(in[28], a[27]);

  b[28] = vaddq_s16(in[28], a[27]);
  b[29] = vaddq_s16(in[29], a[26]);
  b[30] = vaddq_s16(in[30], a[25]);
  b[31] = vaddq_s16(in[31], a[24]);

  // Stage 4.
  a[0] = vaddq_s16(b[0], b[3]);
  a[1] = vaddq_s16(b[1], b[2]);
  a[2] = vsubq_s16(b[1], b[2]);
  a[3] = vsubq_s16(b[0], b[3]);

  a[4] = b[4];

  butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]);

  a[7] = b[7];

  a[8] = vaddq_s16(b[8], b[11]);
  a[9] = vaddq_s16(b[9], b[10]);
  a[10] = vsubq_s16(b[9], b[10]);
  a[11] = vsubq_s16(b[8], b[11]);
  a[12] = vsubq_s16(b[15], b[12]);
  a[13] = vsubq_s16(b[14], b[13]);
  a[14] = vaddq_s16(b[14], b[13]);
  a[15] = vaddq_s16(b[15], b[12]);

  a[16] = b[16];
  a[17] = b[17];

  butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]);
  butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]);
  butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]);
  butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]);

  a[22] = b[22];
  a[23] = b[23];
  a[24] = b[24];
  a[25] = b[25];

  a[30] = b[30];
  a[31] = b[31];

  // Stage 5.
  butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]);
  butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]);

  b[4] = vaddq_s16(a[4], a[5]);
  b[5] = vsubq_s16(a[4], a[5]);
  b[6] = vsubq_s16(a[7], a[6]);
  b[7] = vaddq_s16(a[7], a[6]);

  b[8] = a[8];

  butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]);
  butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]);

  b[11] = a[11];
  b[12] = a[12];

  b[15] = a[15];

  b[16] = vaddq_s16(a[19], a[16]);
  b[17] = vaddq_s16(a[18], a[17]);
  b[18] = vsubq_s16(a[17], a[18]);
  b[19] = vsubq_s16(a[16], a[19]);
  b[20] = vsubq_s16(a[23], a[20]);
  b[21] = vsubq_s16(a[22], a[21]);
  b[22] = vaddq_s16(a[21], a[22]);
  b[23] = vaddq_s16(a[20], a[23]);
  b[24] = vaddq_s16(a[27], a[24]);
  b[25] = vaddq_s16(a[26], a[25]);
  b[26] = vsubq_s16(a[25], a[26]);
  b[27] = vsubq_s16(a[24], a[27]);
  b[28] = vsubq_s16(a[31], a[28]);
  b[29] = vsubq_s16(a[30], a[29]);
  b[30] = vaddq_s16(a[29], a[30]);
  b[31] = vaddq_s16(a[28], a[31]);

  // Stage 6.
  a[0] = b[0];
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];

  butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]);
  butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]);

  a[8] = vaddq_s16(b[8], b[9]);
  a[9] = vsubq_s16(b[8], b[9]);
  a[10] = vsubq_s16(b[11], b[10]);
  a[11] = vaddq_s16(b[11], b[10]);
  a[12] = vaddq_s16(b[12], b[13]);
  a[13] = vsubq_s16(b[12], b[13]);
  a[14] = vsubq_s16(b[15], b[14]);
  a[15] = vaddq_s16(b[15], b[14]);

  a[16] = b[16];
  a[19] = b[19];
  a[20] = b[20];
  a[23] = b[23];
  a[24] = b[24];
  a[27] = b[27];
  a[28] = b[28];
  a[31] = b[31];

  butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]);
  butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]);

  butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]);
  butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25],
                      &a[22]);

  // Stage 7.
  b[0] = a[0];
  b[1] = a[1];
  b[2] = a[2];
  b[3] = a[3];
  b[4] = a[4];
  b[5] = a[5];
  b[6] = a[6];
  b[7] = a[7];

  butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]);
  butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]);
  butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]);
  butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]);

  b[16] = vaddq_s16(a[16], a[17]);
  b[17] = vsubq_s16(a[16], a[17]);
  b[18] = vsubq_s16(a[19], a[18]);
  b[19] = vaddq_s16(a[19], a[18]);
  b[20] = vaddq_s16(a[20], a[21]);
  b[21] = vsubq_s16(a[20], a[21]);
  b[22] = vsubq_s16(a[23], a[22]);
  b[23] = vaddq_s16(a[23], a[22]);
  b[24] = vaddq_s16(a[24], a[25]);
  b[25] = vsubq_s16(a[24], a[25]);
  b[26] = vsubq_s16(a[27], a[26]);
  b[27] = vaddq_s16(a[27], a[26]);
  b[28] = vaddq_s16(a[28], a[29]);
  b[29] = vsubq_s16(a[28], a[29]);
  b[30] = vsubq_s16(a[31], a[30]);
  b[31] = vaddq_s16(a[31], a[30]);

  // Final stage.
  // Also compute partial rounding shift:
  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  // b[0..15] are already final; they are written to the even output slots.
  out[0] = sub_round_shift_s16(b[0]);
  out[16] = sub_round_shift_s16(b[1]);
  out[8] = sub_round_shift_s16(b[2]);
  out[24] = sub_round_shift_s16(b[3]);
  out[4] = sub_round_shift_s16(b[4]);
  out[20] = sub_round_shift_s16(b[5]);
  out[12] = sub_round_shift_s16(b[6]);
  out[28] = sub_round_shift_s16(b[7]);
  out[2] = sub_round_shift_s16(b[8]);
  out[18] = sub_round_shift_s16(b[9]);
  out[10] = sub_round_shift_s16(b[10]);
  out[26] = sub_round_shift_s16(b[11]);
  out[6] = sub_round_shift_s16(b[12]);
  out[22] = sub_round_shift_s16(b[13]);
  out[14] = sub_round_shift_s16(b[14]);
  out[30] = sub_round_shift_s16(b[15]);

  // The odd outputs need one more butterfly each. a[] is reused as scratch,
  // indexed by the output slot the value is destined for.
  butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]);
  out[1] = sub_round_shift_s16(a[1]);
  out[31] = sub_round_shift_s16(a[31]);

  butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]);
  out[17] = sub_round_shift_s16(a[17]);
  out[15] = sub_round_shift_s16(a[15]);

  butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]);
  out[9] = sub_round_shift_s16(a[9]);
  out[23] = sub_round_shift_s16(a[23]);

  butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]);
  out[25] = sub_round_shift_s16(a[25]);
  out[7] = sub_round_shift_s16(a[7]);

  butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]);
  out[5] = sub_round_shift_s16(a[5]);
  out[27] = sub_round_shift_s16(a[27]);

  butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]);
  out[21] = sub_round_shift_s16(a[21]);
  out[11] = sub_round_shift_s16(a[11]);

  butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]);
  out[13] = sub_round_shift_s16(a[13]);
  out[19] = sub_round_shift_s16(a[19]);

  butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]);
  out[29] = sub_round_shift_s16(a[29]);
  out[3] = sub_round_shift_s16(a[3]);
}
// Copy slot `idx` of the from_lo / from_hi array pair into the same slot of
// the to_lo / to_hi pair. `from` and `to` are identifier prefixes that get
// pasted with _lo / _hi; they are not expressions.
#define PASS_THROUGH(from, to, idx) \
  do {                              \
    to##_lo[idx] = from##_lo[idx];  \
    to##_hi[idx] = from##_hi[idx];  \
  } while (0)
// Long add of two vectors from the same array: src[lhs] + src[rhs]. The sum
// of the low halves goes to dst_lo[dst_idx], the sum of the high halves to
// dst_hi[dst_idx]. `dst` is an identifier prefix pasted with _lo / _hi.
#define ADD_S16_S32(src, lhs, rhs, dst, dst_idx)                       \
  do {                                                                 \
    dst##_lo[dst_idx] =                                                \
        vaddl_s16(vget_low_s16(src[lhs]), vget_low_s16(src[rhs]));     \
    dst##_hi[dst_idx] =                                                \
        vaddl_s16(vget_high_s16(src[lhs]), vget_high_s16(src[rhs]));   \
  } while (0)
// Long subtract of two vectors from the same array: src[lhs] - src[rhs]. The
// difference of the low halves goes to dst_lo[dst_idx], that of the high
// halves to dst_hi[dst_idx]. `dst` is an identifier prefix pasted with
// _lo / _hi.
#define SUB_S16_S32(src, lhs, rhs, dst, dst_idx)                       \
  do {                                                                 \
    dst##_lo[dst_idx] =                                                \
        vsubl_s16(vget_low_s16(src[lhs]), vget_low_s16(src[rhs]));     \
    dst##_hi[dst_idx] =                                                \
        vsubl_s16(vget_high_s16(src[lhs]), vget_high_s16(src[rhs]));   \
  } while (0)
// Wide add: adds the single-array vector narrow[narrow_idx] to the split
// value wide_lo / wide_hi [wide_idx] and stores the result in
// dst_lo / dst_hi [dst_idx]. `wide` and `dst` are identifier prefixes pasted
// with _lo / _hi; `narrow` is a plain array.
#define ADDW_S16_S32(wide, wide_idx, narrow, narrow_idx, dst, dst_idx)     \
  do {                                                                     \
    dst##_lo[dst_idx] =                                                    \
        vaddw_s16(wide##_lo[wide_idx], vget_low_s16(narrow[narrow_idx]));  \
    dst##_hi[dst_idx] =                                                    \
        vaddw_s16(wide##_hi[wide_idx], vget_high_s16(narrow[narrow_idx])); \
  } while (0)
// Computes narrow[narrow_idx] - wide[wide_idx] on split lo/hi values. The
// narrow vector is first expanded with vmovl_s16 into tmp_lo / tmp_hi
// [tmp_idx] (that slot is overwritten), then the split `wide` value is
// subtracted from it and the result is stored in dst_lo / dst_hi [dst_idx].
// `wide`, `tmp` and `dst` are identifier prefixes pasted with _lo / _hi.
#define SUBW_S16_S32(narrow, narrow_idx, wide, wide_idx, tmp, tmp_idx, dst, \
                     dst_idx)                                               \
  do {                                                                      \
    tmp##_lo[tmp_idx] = vmovl_s16(vget_low_s16(narrow[narrow_idx]));        \
    tmp##_hi[tmp_idx] = vmovl_s16(vget_high_s16(narrow[narrow_idx]));       \
    dst##_lo[dst_idx] = vsubq_s32(tmp##_lo[tmp_idx], wide##_lo[wide_idx]);  \
    dst##_hi[dst_idx] = vsubq_s32(tmp##_hi[tmp_idx], wide##_hi[wide_idx]);  \
  } while (0)
// Adds two split lo/hi values from the same array pair:
// dst[dst_idx] = src[lhs] + src[rhs], done separately on the _lo and _hi
// halves. `src` and `dst` are identifier prefixes pasted with _lo / _hi.
#define ADD_S32(src, lhs, rhs, dst, dst_idx)                      \
  do {                                                            \
    dst##_lo[dst_idx] = vaddq_s32(src##_lo[lhs], src##_lo[rhs]);  \
    dst##_hi[dst_idx] = vaddq_s32(src##_hi[lhs], src##_hi[rhs]);  \
  } while (0)
// Subtracts two split lo/hi values from the same array pair:
// dst[dst_idx] = src[lhs] - src[rhs], done separately on the _lo and _hi
// halves. `src` and `dst` are identifier prefixes pasted with _lo / _hi.
#define SUB_S32(src, lhs, rhs, dst, dst_idx)                      \
  do {                                                            \
    dst##_lo[dst_idx] = vsubq_s32(src##_lo[lhs], src##_lo[rhs]);  \
    dst##_hi[dst_idx] = vsubq_s32(src##_hi[lhs], src##_hi[rhs]);  \
  } while (0)
// Thin wrapper around butterfly_one_coeff_s16_s32(): feeds it src[lhs],
// src[rhs] and `coeff`, and hands it the addresses of the split outputs
// dst_lo / dst_hi [sum_idx] followed by dst_lo / dst_hi [diff_idx]. `dst` is
// an identifier prefix pasted with _lo / _hi; `src` is a plain array.
#define BUTTERFLY_ONE_S16_S32(src, lhs, rhs, coeff, dst, sum_idx, diff_idx) \
  do {                                                                      \
    butterfly_one_coeff_s16_s32(src[lhs], src[rhs], coeff,                  \
                                &dst##_lo[sum_idx], &dst##_hi[sum_idx],     \
                                &dst##_lo[diff_idx], &dst##_hi[diff_idx]);  \
  } while (0)
// Thin wrapper around butterfly_one_coeff_s32_fast(): passes the split
// inputs a_lo / a_hi [left_index] and a_lo / a_hi [right_index] together with
// `constant`, and the addresses of the split outputs b_lo / b_hi [add_index]
// followed by b_lo / b_hi [sub_index]. `a` and `b` are identifier prefixes
// pasted with _lo / _hi.
#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
                          sub_index)                                          \
  do {                                                                        \
    butterfly_one_coeff_s32_fast(                                             \
        a##_lo[left_index], a##_hi[left_index], a##_lo[right_index],          \
        a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \
        &b##_lo[sub_index], &b##_hi[sub_index]);                              \
  } while (0)
// Thin wrapper around butterfly_two_coeff_s32(): passes the split inputs
// src_lo / src_hi [lhs] and src_lo / src_hi [rhs], the two coefficients, and
// the addresses of the split outputs dst_lo / dst_hi [sum_idx] followed by
// dst_lo / dst_hi [diff_idx]. `src` and `dst` are identifier prefixes pasted
// with _lo / _hi.
#define BUTTERFLY_TWO_S32(src, lhs, rhs, lhs_coeff, rhs_coeff, dst, sum_idx, \
                          diff_idx)                                          \
  do {                                                                       \
    butterfly_two_coeff_s32(src##_lo[lhs], src##_hi[lhs], src##_lo[rhs],     \
                            src##_hi[rhs], lhs_coeff, rhs_coeff,             \
                            &dst##_lo[sum_idx], &dst##_hi[sum_idx],          \
                            &dst##_lo[diff_idx], &dst##_hi[diff_idx]);       \
  } while (0)
// Second-pass butterfly network of the 32-point forward DCT.
//
// in:  32 input vectors (int16x8_t). Unlike dct_body_first_pass(), the stage 1
//      cross is computed here rather than by the caller.
// out: 32 result vectors (int16x8_t), each produced by
//      add_round_shift_s32_narrow() from a split lo/hi intermediate.
//
// Stages 1 and 2 run in int16x8_t (arrays a and b). From stage 3 onward the
// values are kept as int32x4_t pairs: c_lo/c_hi and d_lo/d_hi hold the low and
// high halves of each vector and are used alternately, one stage reading c and
// writing d, the next the other way round.
static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
int16x8_t a[32 ];
int16x8_t b[32 ];
int32x4_t c_lo[32 ];
int32x4_t c_hi[32 ];
int32x4_t d_lo[32 ];
int32x4_t d_hi[32 ];
// Stage 1. In the first pass this cross is done as part of the load; here it
// is computed explicitly: a[i] = in[i] + in[31 - i] and
// a[31 - i] = in[i] - in[31 - i] for i in [0, 16).
a[0 ] = vaddq_s16(in[0 ], in[31 ]);
a[1 ] = vaddq_s16(in[1 ], in[30 ]);
a[2 ] = vaddq_s16(in[2 ], in[29 ]);
a[3 ] = vaddq_s16(in[3 ], in[28 ]);
a[4 ] = vaddq_s16(in[4 ], in[27 ]);
a[5 ] = vaddq_s16(in[5 ], in[26 ]);
a[6 ] = vaddq_s16(in[6 ], in[25 ]);
a[7 ] = vaddq_s16(in[7 ], in[24 ]);
a[8 ] = vaddq_s16(in[8 ], in[23 ]);
a[9 ] = vaddq_s16(in[9 ], in[22 ]);
a[10 ] = vaddq_s16(in[10 ], in[21 ]);
a[11 ] = vaddq_s16(in[11 ], in[20 ]);
a[12 ] = vaddq_s16(in[12 ], in[19 ]);
a[13 ] = vaddq_s16(in[13 ], in[18 ]);
a[14 ] = vaddq_s16(in[14 ], in[17 ]);
a[15 ] = vaddq_s16(in[15 ], in[16 ]);
a[16 ] = vsubq_s16(in[15 ], in[16 ]);
a[17 ] = vsubq_s16(in[14 ], in[17 ]);
a[18 ] = vsubq_s16(in[13 ], in[18 ]);
a[19 ] = vsubq_s16(in[12 ], in[19 ]);
a[20 ] = vsubq_s16(in[11 ], in[20 ]);
a[21 ] = vsubq_s16(in[10 ], in[21 ]);
a[22 ] = vsubq_s16(in[9 ], in[22 ]);
a[23 ] = vsubq_s16(in[8 ], in[23 ]);
a[24 ] = vsubq_s16(in[7 ], in[24 ]);
a[25 ] = vsubq_s16(in[6 ], in[25 ]);
a[26 ] = vsubq_s16(in[5 ], in[26 ]);
a[27 ] = vsubq_s16(in[4 ], in[27 ]);
a[28 ] = vsubq_s16(in[3 ], in[28 ]);
a[29 ] = vsubq_s16(in[2 ], in[29 ]);
a[30 ] = vsubq_s16(in[1 ], in[30 ]);
a[31 ] = vsubq_s16(in[0 ], in[31 ]);
// Stage 2. Cross the first 16 values; 16..19 and 28..31 pass through; the
// pairs (27, 20), (26, 21), (25, 22), (24, 23) go through a cospi_16_64
// butterfly. Still in 16-bit lanes.
b[0 ] = vaddq_s16(a[0 ], a[15 ]);
b[1 ] = vaddq_s16(a[1 ], a[14 ]);
b[2 ] = vaddq_s16(a[2 ], a[13 ]);
b[3 ] = vaddq_s16(a[3 ], a[12 ]);
b[4 ] = vaddq_s16(a[4 ], a[11 ]);
b[5 ] = vaddq_s16(a[5 ], a[10 ]);
b[6 ] = vaddq_s16(a[6 ], a[9 ]);
b[7 ] = vaddq_s16(a[7 ], a[8 ]);
b[8 ] = vsubq_s16(a[7 ], a[8 ]);
b[9 ] = vsubq_s16(a[6 ], a[9 ]);
b[10 ] = vsubq_s16(a[5 ], a[10 ]);
b[11 ] = vsubq_s16(a[4 ], a[11 ]);
b[12 ] = vsubq_s16(a[3 ], a[12 ]);
b[13 ] = vsubq_s16(a[2 ], a[13 ]);
b[14 ] = vsubq_s16(a[1 ], a[14 ]);
b[15 ] = vsubq_s16(a[0 ], a[15 ]);
b[16 ] = a[16 ];
b[17 ] = a[17 ];
b[18 ] = a[18 ];
b[19 ] = a[19 ];
butterfly_one_coeff_s16_s32_narrow(a[27 ], a[20 ], cospi_16_64, &b[27 ], &b[20 ]);
butterfly_one_coeff_s16_s32_narrow(a[26 ], a[21 ], cospi_16_64, &b[26 ], &b[21 ]);
butterfly_one_coeff_s16_s32_narrow(a[25 ], a[22 ], cospi_16_64, &b[25 ], &b[22 ]);
butterfly_one_coeff_s16_s32_narrow(a[24 ], a[23 ], cospi_16_64, &b[24 ], &b[23 ]);
b[28 ] = a[28 ];
b[29 ] = a[29 ];
b[30 ] = a[30 ];
b[31 ] = a[31 ];
// Stage 3. With extreme values for input this calculation rolls over int16_t.
// The sources for b[0] get added multiple times and, through testing, have
// been shown to overflow starting here.
// From here on results are written to the split int32x4_t arrays (c, d).
ADD_S16_S32(b, 0 , 7 , c, 0 );
ADD_S16_S32(b, 1 , 6 , c, 1 );
ADD_S16_S32(b, 2 , 5 , c, 2 );
ADD_S16_S32(b, 3 , 4 , c, 3 );
SUB_S16_S32(b, 3 , 4 , c, 4 );
SUB_S16_S32(b, 2 , 5 , c, 5 );
SUB_S16_S32(b, 1 , 6 , c, 6 );
SUB_S16_S32(b, 0 , 7 , c, 7 );
// 8, 9, 14 and 15 pass through stage 3 unchanged. They are kept as 16-bit
// copies in a[] and only widened when stage 4 combines them with c[10..13].
a[8 ] = b[8 ];
a[9 ] = b[9 ];
BUTTERFLY_ONE_S16_S32(b, 13 , 10 , cospi_16_64, c, 13 , 10 );
BUTTERFLY_ONE_S16_S32(b, 12 , 11 , cospi_16_64, c, 12 , 11 );
a[14 ] = b[14 ];
a[15 ] = b[15 ];
ADD_S16_S32(b, 16 , 23 , c, 16 );
ADD_S16_S32(b, 17 , 22 , c, 17 );
ADD_S16_S32(b, 18 , 21 , c, 18 );
ADD_S16_S32(b, 19 , 20 , c, 19 );
SUB_S16_S32(b, 19 , 20 , c, 20 );
SUB_S16_S32(b, 18 , 21 , c, 21 );
SUB_S16_S32(b, 17 , 22 , c, 22 );
SUB_S16_S32(b, 16 , 23 , c, 23 );
SUB_S16_S32(b, 31 , 24 , c, 24 );
SUB_S16_S32(b, 30 , 25 , c, 25 );
SUB_S16_S32(b, 29 , 26 , c, 26 );
SUB_S16_S32(b, 28 , 27 , c, 27 );
ADD_S16_S32(b, 28 , 27 , c, 28 );
ADD_S16_S32(b, 29 , 26 , c, 29 );
ADD_S16_S32(b, 30 , 25 , c, 30 );
ADD_S16_S32(b, 31 , 24 , c, 31 );
// Stage 4.
ADD_S32(c, 0 , 3 , d, 0 );
ADD_S32(c, 1 , 2 , d, 1 );
SUB_S32(c, 1 , 2 , d, 2 );
SUB_S32(c, 0 , 3 , d, 3 );
PASS_THROUGH(c, d, 4 );
BUTTERFLY_ONE_S32(c, 6 , 5 , cospi_16_64, d, 6 , 5 );
PASS_THROUGH(c, d, 7 );
// Combine the 16-bit pass-through values (8, 9, 14, 15) with the 32-bit
// butterfly results c[10..13].
ADDW_S16_S32(c, 11 , a, 8 , d, 8 );
ADDW_S16_S32(c, 10 , a, 9 , d, 9 );
// SUBW writes the widened 16-bit operand into the otherwise unused slots
// c[9], c[8], c[15], c[14] as scratch before subtracting.
SUBW_S16_S32(a, 9 , c, 10 , c, 9 , d, 10 );
SUBW_S16_S32(a, 8 , c, 11 , c, 8 , d, 11 );
SUBW_S16_S32(a, 15 , c, 12 , c, 15 , d, 12 );
SUBW_S16_S32(a, 14 , c, 13 , c, 14 , d, 13 );
// b[14] and b[15] hold the same values as a[14] and a[15] (copied in stage 3).
ADDW_S16_S32(c, 13 , b, 14 , d, 14 );
ADDW_S16_S32(c, 12 , b, 15 , d, 15 );
PASS_THROUGH(c, d, 16 );
PASS_THROUGH(c, d, 17 );
BUTTERFLY_TWO_S32(c, 29 , 18 , cospi_8_64, cospi_24_64, d, 29 , 18 );
BUTTERFLY_TWO_S32(c, 28 , 19 , cospi_8_64, cospi_24_64, d, 28 , 19 );
BUTTERFLY_TWO_S32(c, 27 , 20 , cospi_24_64, -cospi_8_64, d, 27 , 20 );
BUTTERFLY_TWO_S32(c, 26 , 21 , cospi_24_64, -cospi_8_64, d, 26 , 21 );
PASS_THROUGH(c, d, 22 );
PASS_THROUGH(c, d, 23 );
PASS_THROUGH(c, d, 24 );
PASS_THROUGH(c, d, 25 );
PASS_THROUGH(c, d, 30 );
PASS_THROUGH(c, d, 31 );
// Stage 5.
BUTTERFLY_ONE_S32(d, 0 , 1 , cospi_16_64, c, 0 , 1 );
BUTTERFLY_TWO_S32(d, 3 , 2 , cospi_8_64, cospi_24_64, c, 2 , 3 );
ADD_S32(d, 4 , 5 , c, 4 );
SUB_S32(d, 4 , 5 , c, 5 );
SUB_S32(d, 7 , 6 , c, 6 );
ADD_S32(d, 7 , 6 , c, 7 );
PASS_THROUGH(d, c, 8 );
BUTTERFLY_TWO_S32(d, 14 , 9 , cospi_8_64, cospi_24_64, c, 14 , 9 );
BUTTERFLY_TWO_S32(d, 13 , 10 , cospi_24_64, -cospi_8_64, c, 13 , 10 );
PASS_THROUGH(d, c, 11 );
PASS_THROUGH(d, c, 12 );
PASS_THROUGH(d, c, 15 );
ADD_S32(d, 16 , 19 , c, 16 );
ADD_S32(d, 17 , 18 , c, 17 );
SUB_S32(d, 17 , 18 , c, 18 );
SUB_S32(d, 16 , 19 , c, 19 );
SUB_S32(d, 23 , 20 , c, 20 );
SUB_S32(d, 22 , 21 , c, 21 );
ADD_S32(d, 22 , 21 , c, 22 );
ADD_S32(d, 23 , 20 , c, 23 );
ADD_S32(d, 24 , 27 , c, 24 );
ADD_S32(d, 25 , 26 , c, 25 );
SUB_S32(d, 25 , 26 , c, 26 );
SUB_S32(d, 24 , 27 , c, 27 );
SUB_S32(d, 31 , 28 , c, 28 );
SUB_S32(d, 30 , 29 , c, 29 );
ADD_S32(d, 30 , 29 , c, 30 );
ADD_S32(d, 31 , 28 , c, 31 );
// Stage 6.
PASS_THROUGH(c, d, 0 );
PASS_THROUGH(c, d, 1 );
PASS_THROUGH(c, d, 2 );
PASS_THROUGH(c, d, 3 );
BUTTERFLY_TWO_S32(c, 7 , 4 , cospi_4_64, cospi_28_64, d, 4 , 7 );
BUTTERFLY_TWO_S32(c, 6 , 5 , cospi_20_64, cospi_12_64, d, 5 , 6 );
ADD_S32(c, 8 , 9 , d, 8 );
SUB_S32(c, 8 , 9 , d, 9 );
SUB_S32(c, 11 , 10 , d, 10 );
ADD_S32(c, 11 , 10 , d, 11 );
ADD_S32(c, 12 , 13 , d, 12 );
SUB_S32(c, 12 , 13 , d, 13 );
SUB_S32(c, 15 , 14 , d, 14 );
ADD_S32(c, 15 , 14 , d, 15 );
PASS_THROUGH(c, d, 16 );
PASS_THROUGH(c, d, 19 );
PASS_THROUGH(c, d, 20 );
PASS_THROUGH(c, d, 23 );
PASS_THROUGH(c, d, 24 );
PASS_THROUGH(c, d, 27 );
PASS_THROUGH(c, d, 28 );
PASS_THROUGH(c, d, 31 );
BUTTERFLY_TWO_S32(c, 30 , 17 , cospi_4_64, cospi_28_64, d, 30 , 17 );
BUTTERFLY_TWO_S32(c, 29 , 18 , cospi_28_64, -cospi_4_64, d, 29 , 18 );
BUTTERFLY_TWO_S32(c, 26 , 21 , cospi_20_64, cospi_12_64, d, 26 , 21 );
BUTTERFLY_TWO_S32(c, 25 , 22 , cospi_12_64, -cospi_20_64, d, 25 , 22 );
// Stage 7.
PASS_THROUGH(d, c, 0 );
PASS_THROUGH(d, c, 1 );
PASS_THROUGH(d, c, 2 );
PASS_THROUGH(d, c, 3 );
PASS_THROUGH(d, c, 4 );
PASS_THROUGH(d, c, 5 );
PASS_THROUGH(d, c, 6 );
PASS_THROUGH(d, c, 7 );
BUTTERFLY_TWO_S32(d, 15 , 8 , cospi_2_64, cospi_30_64, c, 8 , 15 );
BUTTERFLY_TWO_S32(d, 14 , 9 , cospi_18_64, cospi_14_64, c, 9 , 14 );
BUTTERFLY_TWO_S32(d, 13 , 10 , cospi_10_64, cospi_22_64, c, 10 , 13 );
BUTTERFLY_TWO_S32(d, 12 , 11 , cospi_26_64, cospi_6_64, c, 11 , 12 );
ADD_S32(d, 16 , 17 , c, 16 );
SUB_S32(d, 16 , 17 , c, 17 );
SUB_S32(d, 19 , 18 , c, 18 );
ADD_S32(d, 19 , 18 , c, 19 );
ADD_S32(d, 20 , 21 , c, 20 );
SUB_S32(d, 20 , 21 , c, 21 );
SUB_S32(d, 23 , 22 , c, 22 );
ADD_S32(d, 23 , 22 , c, 23 );
ADD_S32(d, 24 , 25 , c, 24 );
SUB_S32(d, 24 , 25 , c, 25 );
SUB_S32(d, 27 , 26 , c, 26 );
ADD_S32(d, 27 , 26 , c, 27 );
ADD_S32(d, 28 , 29 , c, 28 );
SUB_S32(d, 28 , 29 , c, 29 );
SUB_S32(d, 31 , 30 , c, 30 );
ADD_S32(d, 31 , 30 , c, 31 );
// Final stage.
// Roll rounding into this function so we can pass back int16x8.
// c[0..15] are already final: c[k] is written to out[] at the 5-bit
// bit-reversal of k (0, 16, 8, 24, 4, ...), i.e. to the even output slots.
out[0 ] = add_round_shift_s32_narrow(c_lo[0 ], c_hi[0 ]);
out[16 ] = add_round_shift_s32_narrow(c_lo[1 ], c_hi[1 ]);
out[8 ] = add_round_shift_s32_narrow(c_lo[2 ], c_hi[2 ]);
out[24 ] = add_round_shift_s32_narrow(c_lo[3 ], c_hi[3 ]);
out[4 ] = add_round_shift_s32_narrow(c_lo[4 ], c_hi[4 ]);
out[20 ] = add_round_shift_s32_narrow(c_lo[5 ], c_hi[5 ]);
out[12 ] = add_round_shift_s32_narrow(c_lo[6 ], c_hi[6 ]);
out[28 ] = add_round_shift_s32_narrow(c_lo[7 ], c_hi[7 ]);
out[2 ] = add_round_shift_s32_narrow(c_lo[8 ], c_hi[8 ]);
out[18 ] = add_round_shift_s32_narrow(c_lo[9 ], c_hi[9 ]);
out[10 ] = add_round_shift_s32_narrow(c_lo[10 ], c_hi[10 ]);
out[26 ] = add_round_shift_s32_narrow(c_lo[11 ], c_hi[11 ]);
out[6 ] = add_round_shift_s32_narrow(c_lo[12 ], c_hi[12 ]);
out[22 ] = add_round_shift_s32_narrow(c_lo[13 ], c_hi[13 ]);
out[14 ] = add_round_shift_s32_narrow(c_lo[14 ], c_hi[14 ]);
out[30 ] = add_round_shift_s32_narrow(c_lo[15 ], c_hi[15 ]);
// The odd outputs need one more butterfly each. d is reused as scratch,
// indexed by the output slot the value is destined for.
BUTTERFLY_TWO_S32(c, 31 , 16 , cospi_1_64, cospi_31_64, d, 1 , 31 );
out[1 ] = add_round_shift_s32_narrow(d_lo[1 ], d_hi[1 ]);
out[31 ] = add_round_shift_s32_narrow(d_lo[31 ], d_hi[31 ]);
BUTTERFLY_TWO_S32(c, 30 , 17 , cospi_17_64, cospi_15_64, d, 17 , 15 );
out[17 ] = add_round_shift_s32_narrow(d_lo[17 ], d_hi[17 ]);
out[15 ] = add_round_shift_s32_narrow(d_lo[15 ], d_hi[15 ]);
BUTTERFLY_TWO_S32(c, 29 , 18 , cospi_9_64, cospi_23_64, d, 9 , 23 );
out[9 ] = add_round_shift_s32_narrow(d_lo[9 ], d_hi[9 ]);
out[23 ] = add_round_shift_s32_narrow(d_lo[23 ], d_hi[23 ]);
BUTTERFLY_TWO_S32(c, 28 , 19 , cospi_25_64, cospi_7_64, d, 25 , 7 );
out[25 ] = add_round_shift_s32_narrow(d_lo[25 ], d_hi[25 ]);
out[7 ] = add_round_shift_s32_narrow(d_lo[7 ], d_hi[7 ]);
BUTTERFLY_TWO_S32(c, 27 , 20 , cospi_5_64, cospi_27_64, d, 5 , 27 );
out[5 ] = add_round_shift_s32_narrow(d_lo[5 ], d_hi[5 ]);
out[27 ] = add_round_shift_s32_narrow(d_lo[27 ], d_hi[27 ]);
BUTTERFLY_TWO_S32(c, 26 , 21 , cospi_21_64, cospi_11_64, d, 21 , 11 );
out[21 ] = add_round_shift_s32_narrow(d_lo[21 ], d_hi[21 ]);
out[11 ] = add_round_shift_s32_narrow(d_lo[11 ], d_hi[11 ]);
BUTTERFLY_TWO_S32(c, 25 , 22 , cospi_13_64, cospi_19_64, d, 13 , 19 );
out[13 ] = add_round_shift_s32_narrow(d_lo[13 ], d_hi[13 ]);
out[19 ] = add_round_shift_s32_narrow(d_lo[19 ], d_hi[19 ]);
BUTTERFLY_TWO_S32(c, 24 , 23 , cospi_29_64, cospi_3_64, d, 29 , 3 );
out[29 ] = add_round_shift_s32_narrow(d_lo[29 ], d_hi[29 ]);
out[3 ] = add_round_shift_s32_narrow(d_lo[3 ], d_hi[3 ]);
}
static INLINE void dct_body_second_pass_rd(const int16x8_t *in,
int16x8_t *out) {
int16x8_t a[32 ];
int16x8_t b[32 ];
// Stage 1. Done as part of the load for the first pass.
a[0 ] = vaddq_s16(in[0 ], in[31 ]);
a[1 ] = vaddq_s16(in[1 ], in[30 ]);
a[2 ] = vaddq_s16(in[2 ], in[29 ]);
a[3 ] = vaddq_s16(in[3 ], in[28 ]);
a[4 ] = vaddq_s16(in[4 ], in[27 ]);
a[5 ] = vaddq_s16(in[5 ], in[26 ]);
a[6 ] = vaddq_s16(in[6 ], in[25 ]);
a[7 ] = vaddq_s16(in[7 ], in[24 ]);
a[8 ] = vaddq_s16(in[8 ], in[23 ]);
a[9 ] = vaddq_s16(in[9 ], in[22 ]);
a[10 ] = vaddq_s16(in[10 ], in[21 ]);
a[11 ] = vaddq_s16(in[11 ], in[20 ]);
a[12 ] = vaddq_s16(in[12 ], in[19 ]);
a[13 ] = vaddq_s16(in[13 ], in[18 ]);
a[14 ] = vaddq_s16(in[14 ], in[17 ]);
a[15 ] = vaddq_s16(in[15 ], in[16 ]);
a[16 ] = vsubq_s16(in[15 ], in[16 ]);
a[17 ] = vsubq_s16(in[14 ], in[17 ]);
a[18 ] = vsubq_s16(in[13 ], in[18 ]);
a[19 ] = vsubq_s16(in[12 ], in[19 ]);
a[20 ] = vsubq_s16(in[11 ], in[20 ]);
a[21 ] = vsubq_s16(in[10 ], in[21 ]);
a[22 ] = vsubq_s16(in[9 ], in[22 ]);
a[23 ] = vsubq_s16(in[8 ], in[23 ]);
a[24 ] = vsubq_s16(in[7 ], in[24 ]);
a[25 ] = vsubq_s16(in[6 ], in[25 ]);
a[26 ] = vsubq_s16(in[5 ], in[26 ]);
a[27 ] = vsubq_s16(in[4 ], in[27 ]);
a[28 ] = vsubq_s16(in[3 ], in[28 ]);
a[29 ] = vsubq_s16(in[2 ], in[29 ]);
a[30 ] = vsubq_s16(in[1 ], in[30 ]);
a[31 ] = vsubq_s16(in[0 ], in[31 ]);
// Stage 2.
// For the "rd" version, all the values are rounded down after stage 2 to keep
// the values in 16 bits.
b[0 ] = add_round_shift_s16(vaddq_s16(a[0 ], a[15 ]));
b[1 ] = add_round_shift_s16(vaddq_s16(a[1 ], a[14 ]));
b[2 ] = add_round_shift_s16(vaddq_s16(a[2 ], a[13 ]));
b[3 ] = add_round_shift_s16(vaddq_s16(a[3 ], a[12 ]));
b[4 ] = add_round_shift_s16(vaddq_s16(a[4 ], a[11 ]));
b[5 ] = add_round_shift_s16(vaddq_s16(a[5 ], a[10 ]));
b[6 ] = add_round_shift_s16(vaddq_s16(a[6 ], a[9 ]));
b[7 ] = add_round_shift_s16(vaddq_s16(a[7 ], a[8 ]));
b[8 ] = add_round_shift_s16(vsubq_s16(a[7 ], a[8 ]));
b[9 ] = add_round_shift_s16(vsubq_s16(a[6 ], a[9 ]));
b[10 ] = add_round_shift_s16(vsubq_s16(a[5 ], a[10 ]));
b[11 ] = add_round_shift_s16(vsubq_s16(a[4 ], a[11 ]));
b[12 ] = add_round_shift_s16(vsubq_s16(a[3 ], a[12 ]));
b[13 ] = add_round_shift_s16(vsubq_s16(a[2 ], a[13 ]));
b[14 ] = add_round_shift_s16(vsubq_s16(a[1 ], a[14 ]));
b[15 ] = add_round_shift_s16(vsubq_s16(a[0 ], a[15 ]));
b[16 ] = add_round_shift_s16(a[16 ]);
b[17 ] = add_round_shift_s16(a[17 ]);
b[18 ] = add_round_shift_s16(a[18 ]);
b[19 ] = add_round_shift_s16(a[19 ]);
butterfly_one_coeff_s16_s32_narrow(a[27 ], a[20 ], cospi_16_64, &b[27 ], &b[20 ]);
butterfly_one_coeff_s16_s32_narrow(a[26 ], a[21 ], cospi_16_64, &b[26 ], &b[21 ]);
butterfly_one_coeff_s16_s32_narrow(a[25 ], a[22 ], cospi_16_64, &b[25 ], &b[22 ]);
butterfly_one_coeff_s16_s32_narrow(a[24 ], a[23 ], cospi_16_64, &b[24 ], &b[23 ]);
b[20 ] = add_round_shift_s16(b[20 ]);
b[21 ] = add_round_shift_s16(b[21 ]);
b[22 ] = add_round_shift_s16(b[22 ]);
b[23 ] = add_round_shift_s16(b[23 ]);
b[24 ] = add_round_shift_s16(b[24 ]);
b[25 ] = add_round_shift_s16(b[25 ]);
b[26 ] = add_round_shift_s16(b[26 ]);
b[27 ] = add_round_shift_s16(b[27 ]);
b[28 ] = add_round_shift_s16(a[28 ]);
b[29 ] = add_round_shift_s16(a[29 ]);
b[30 ] = add_round_shift_s16(a[30 ]);
b[31 ] = add_round_shift_s16(a[31 ]);
// Stage 3.
a[0 ] = vaddq_s16(b[0 ], b[7 ]);
a[1 ] = vaddq_s16(b[1 ], b[6 ]);
a[2 ] = vaddq_s16(b[2 ], b[5 ]);
a[3 ] = vaddq_s16(b[3 ], b[4 ]);
a[4 ] = vsubq_s16(b[3 ], b[4 ]);
a[5 ] = vsubq_s16(b[2 ], b[5 ]);
a[6 ] = vsubq_s16(b[1 ], b[6 ]);
a[7 ] = vsubq_s16(b[0 ], b[7 ]);
a[8 ] = b[8 ];
a[9 ] = b[9 ];
butterfly_one_coeff_s16_s32_narrow(b[13 ], b[10 ], cospi_16_64, &a[13 ], &a[10 ]);
butterfly_one_coeff_s16_s32_narrow(b[12 ], b[11 ], cospi_16_64, &a[12 ], &a[11 ]);
a[14 ] = b[14 ];
a[15 ] = b[15 ];
a[16 ] = vaddq_s16(b[16 ], b[23 ]);
a[17 ] = vaddq_s16(b[17 ], b[22 ]);
a[18 ] = vaddq_s16(b[18 ], b[21 ]);
a[19 ] = vaddq_s16(b[19 ], b[20 ]);
a[20 ] = vsubq_s16(b[19 ], b[20 ]);
a[21 ] = vsubq_s16(b[18 ], b[21 ]);
a[22 ] = vsubq_s16(b[17 ], b[22 ]);
a[23 ] = vsubq_s16(b[16 ], b[23 ]);
a[24 ] = vsubq_s16(b[31 ], b[24 ]);
a[25 ] = vsubq_s16(b[30 ], b[25 ]);
a[26 ] = vsubq_s16(b[29 ], b[26 ]);
a[27 ] = vsubq_s16(b[28 ], b[27 ]);
a[28 ] = vaddq_s16(b[28 ], b[27 ]);
a[29 ] = vaddq_s16(b[29 ], b[26 ]);
a[30 ] = vaddq_s16(b[30 ], b[25 ]);
a[31 ] = vaddq_s16(b[31 ], b[24 ]);
// Stage 4.
b[0 ] = vaddq_s16(a[0 ], a[3 ]);
b[1 ] = vaddq_s16(a[1 ], a[2 ]);
b[2 ] = vsubq_s16(a[1 ], a[2 ]);
b[3 ] = vsubq_s16(a[0 ], a[3 ]);
b[4 ] = a[4 ];
butterfly_one_coeff_s16_s32_narrow(a[6 ], a[5 ], cospi_16_64, &b[6 ], &b[5 ]);
b[7 ] = a[7 ];
b[8 ] = vaddq_s16(a[8 ], a[11 ]);
b[9 ] = vaddq_s16(a[9 ], a[10 ]);
b[10 ] = vsubq_s16(a[9 ], a[10 ]);
b[11 ] = vsubq_s16(a[8 ], a[11 ]);
b[12 ] = vsubq_s16(a[15 ], a[12 ]);
b[13 ] = vsubq_s16(a[14 ], a[13 ]);
b[14 ] = vaddq_s16(a[14 ], a[13 ]);
b[15 ] = vaddq_s16(a[15 ], a[12 ]);
b[16 ] = a[16 ];
b[17 ] = a[17 ];
butterfly_two_coeff(a[29 ], a[18 ], cospi_8_64, cospi_24_64, &b[>29 ], &b[18 ]);
butterfly_two_coeff(a[28 ], a[19 ], cospi_8_64, cospi_24_64, &b[>28 ], &b[19 ]);
butterfly_two_coeff(a[27 ], a[20 ], cospi_24_64, -cospi_8_64, &b[27 ], &b[20 ]);
butterfly_two_coeff(a[26 ], a[21 ], cospi_24_64, -cospi_8_64, &b[26 ], &b[21 ]);
b[22 ] = a[22 ];
b[23 ] = a[23 ];
b[24 ] = a[24 ];
b[25 ] = a[25 ];
b[30 ] = a[30 ];
b[31 ] = a[31 ];
// Stage 5.
butterfly_one_coeff_s16_s32_narrow(b[0 ], b[1 ], cospi_16_64, &a[0 ], &a[1 ]);
butterfly_two_coeff(b[3 ], b[2 ], cospi_8_64, cospi_24_64, &a[2 ], &a[3 ]);
a[4 ] = vaddq_s16(b[4 ], b[5 ]);
a[5 ] = vsubq_s16(b[4 ], b[5 ]);
a[6 ] = vsubq_s16(b[7 ], b[6 ]);
a[7 ] = vaddq_s16(b[7 ], b[6 ]);
a[8 ] = b[8 ];
butterfly_two_coeff(b[14 ], b[9 ], cospi_8_64, cospi_24_64, &a[14 ], &a[9 ]);
butterfly_two_coeff(b[13 ], b[10 ], cospi_24_64, -cospi_8_64, &a[13 ], &a[10 ]);
a[11 ] = b[11 ];
a[12 ] = b[12 ];
a[15 ] = b[15 ];
a[16 ] = vaddq_s16(b[19 ], b[16 ]);
a[17 ] = vaddq_s16(b[18 ], b[17 ]);
a[18 ] = vsubq_s16(b[17 ], b[18 ]);
a[19 ] = vsubq_s16(b[16 ], b[19 ]);
a[20 ] = vsubq_s16(b[23 ], b[20 ]);
a[21 ] = vsubq_s16(b[22 ], b[21 ]);
a[22 ] = vaddq_s16(b[21 ], b[22 ]);
a[23 ] = vaddq_s16(b[20 ], b[23 ]);
a[24 ] = vaddq_s16(b[27 ], b[24 ]);
a[25 ] = vaddq_s16(b[26 ], b[25 ]);
a[26 ] = vsubq_s16(b[25 ], b[26 ]);
a[27 ] = vsubq_s16(b[24 ], b[27 ]);
a[28 ] = vsubq_s16(b[31 ], b[28 ]);
a[29 ] = vsubq_s16(b[30 ], b[29 ]);
a[30 ] = vaddq_s16(b[29 ], b[30 ]);
a[31 ] = vaddq_s16(b[28 ], b[31 ]);
// Stage 6.
b[0 ] = a[0 ];
b[1 ] = a[1 ];
b[2 ] = a[2 ];
b[3 ] = a[3 ];
butterfly_two_coeff(a[7 ], a[4 ], cospi_4_64, cospi_28_64, &b[4 ], &b[7 ]);
butterfly_two_coeff(a[6 ], a[5 ], cospi_20_64, cospi_12_64, &b[5 ], &b[6 ]);
b[8 ] = vaddq_s16(a[8 ], a[9 ]);
b[9 ] = vsubq_s16(a[8 ], a[9 ]);
b[10 ] = vsubq_s16(a[11 ], a[10 ]);
b[11 ] = vaddq_s16(a[11 ], a[10 ]);
b[12 ] = vaddq_s16(a[12 ], a[13 ]);
b[13 ] = vsubq_s16(a[12 ], a[13 ]);
b[14 ] = vsubq_s16(a[15 ], a[14 ]);
b[15 ] = vaddq_s16(a[15 ], a[14 ]);
b[16 ] = a[16 ];
b[19 ] = a[19 ];
b[20 ] = a[20 ];
b[23 ] = a[23 ];
b[24 ] = a[24 ];
b[27 ] = a[27 ];
b[28 ] = a[28 ];
b[31 ] = a[31 ];
butterfly_two_coeff(a[30 ], a[17 ], cospi_4_64, cospi_28_64, &b[>30 ], &b[17 ]);
butterfly_two_coeff(a[29 ], a[18 ], cospi_28_64, -cospi_4_64, &b[29 ], &b[18 ]);
butterfly_two_coeff(a[26 ], a[21 ], cospi_20_64, cospi_12_64, &b[26 ], &b[21 ]);
butterfly_two_coeff(a[25 ], a[22 ], cospi_12_64, -cospi_20_64, &b[25 ], &b[22 ]);
// Stage 7.
a[0 ] = b[0 ];
a[1 ] = b[1 ];
a[2 ] = b[2 ];
a[3 ] = b[3 ];
a[4 ] = b[4 ];
a[5 ] = b[5 ];
a[6 ] = b[6 ];
a[7 ] = b[7 ];
butterfly_two_coeff(b[15 ], b[8 ], cospi_2_64, cospi_30_64, &a[8 ], &a[15 ]);
butterfly_two_coeff(b[14 ], b[9 ], cospi_18_64, cospi_14_64, &a[>9 ], &a[14 ]);
butterfly_two_coeff(b[13 ], b[10 ], cospi_10_64, cospi_22_64, &a[10 ], &a[13 ]);
butterfly_two_coeff(b[12 ], b[11 ], cospi_26_64, cospi_6_64, &a[>11 ], &a[12 ]);
a[16 ] = vaddq_s16(b[16 ], b[17 ]);
a[17 ] = vsubq_s16(b[16 ], b[17 ]);
a[18 ] = vsubq_s16(b[19 ], b[18 ]);
a[19 ] = vaddq_s16(b[19 ], b[18 ]);
a[20 ] = vaddq_s16(b[20 ], b[21 ]);
a[21 ] = vsubq_s16(b[20 ], b[21 ]);
a[22 ] = vsubq_s16(b[23 ], b[22 ]);
a[23 ] = vaddq_s16(b[23 ], b[22 ]);
a[24 ] = vaddq_s16(b[24 ], b[25 ]);
a[25 ] = vsubq_s16(b[24 ], b[25 ]);
a[26 ] = vsubq_s16(b[27 ], b[26 ]);
a[27 ] = vaddq_s16(b[27 ], b[26 ]);
a[28 ] = vaddq_s16(b[28 ], b[29 ]);
a[29 ] = vsubq_s16(b[28 ], b[29 ]);
a[30 ] = vsubq_s16(b[31 ], b[30 ]);
a[31 ] = vaddq_s16(b[31 ], b[30 ]);
// Final stage.
out[0 ] = a[0 ];
out[16 ] = a[1 ];
out[8 ] = a[2 ];
out[24 ] = a[3 ];
out[4 ] = a[4 ];
out[20 ] = a[5 ];
out[12 ] = a[6 ];
out[28 ] = a[7 ];
out[2 ] = a[8 ];
out[18 ] = a[9 ];
out[10 ] = a[10 ];
out[26 ] = a[11 ];
out[6 ] = a[12 ];
out[22 ] = a[13 ];
out[14 ] = a[14 ];
out[30 ] = a[15 ];
butterfly_two_coeff(a[31 ], a[16 ], cospi_1_64, cospi_31_64, &out[>1 ], &out[31 ]);
butterfly_two_coeff(a[30 ], a[17 ], cospi_17_64, cospi_15_64, &out[17 ],
&out[15 ]);
butterfly_two_coeff(a[29 ], a[18 ], cospi_9_64, cospi_23_64, &out[>9 ], &out[23 ]);
butterfly_two_coeff(a[28 ], a[19 ], cospi_25_64, cospi_7_64, &out[>25 ], &out[7 ]);
butterfly_two_coeff(a[27 ], a[20 ], cospi_5_64, cospi_27_64, &out[>5 ], &out[27 ]);
butterfly_two_coeff(a[26 ], a[21 ], cospi_21_64, cospi_11_64, &out[21 ],
&out[11 ]);
butterfly_two_coeff(a[25 ], a[22 ], cospi_13_64, cospi_19_64, &out[13 ],
&out[19 ]);
butterfly_two_coeff(a[24 ], a[23 ], cospi_29_64, cospi_3_64, &out[>29 ], &out[3 ]);
}
// Remove the helper macros so they are not visible past this point.
// NOTE(review): presumably these are the macros defined earlier in this
// header for the transform bodies above - confirm against their #define site.
#undef PASS_THROUGH
#undef ADD_S16_S32
#undef SUB_S16_S32
#undef ADDW_S16_S32
#undef SUBW_S16_S32
#undef ADD_S32
#undef SUB_S32
#undef BUTTERFLY_ONE_S16_S32
#undef BUTTERFLY_ONE_S32
#undef BUTTERFLY_TWO_S32
#if CONFIG_VP9_HIGHBITDEPTH
// Store 32 rows of 32 values each to |a|, assuming stride == 32.
//
// Every input array is indexed from 0 to 31, so each one must hold at least
// 32 vectors (the size annotations previously said 16, which contradicted the
// loop below). Row i of the output is written as eight consecutive 4-lane
// vectors in this order:
//   l1[i] r1[i] l2[i] r2[i] l3[i] r3[i] l4[i] r4[i]
static INLINE void store32x32_s32(
    tran_low_t *a, const int32x4_t *l1 /*[32]*/, const int32x4_t *r1 /*[32]*/,
    const int32x4_t *l2 /*[32]*/, const int32x4_t *r2 /*[32]*/,
    const int32x4_t *l3 /*[32]*/, const int32x4_t *r3 /*[32]*/,
    const int32x4_t *l4 /*[32]*/, const int32x4_t *r4 /*[32]*/) {
  int i;
  for (i = 0; i < 32; i++) {
    vst1q_s32(a, l1[i]);
    vst1q_s32(a + 4, r1[i]);
    vst1q_s32(a + 8, l2[i]);
    vst1q_s32(a + 12, r2[i]);
    vst1q_s32(a + 16, l3[i]);
    vst1q_s32(a + 20, r3[i]);
    vst1q_s32(a + 24, l4[i]);
    vst1q_s32(a + 28, r4[i]);
    // Advance one full row (8 vectors x 4 lanes).
    a += 32;
  }
}
// Widen 32 int16x8_t vectors to 32 bits and multiply by 4 (shift left by 2).
// The low half of a[i] is written to left[i] and the high half to right[i].
// All of |left| is filled before any element of |right| is written.
static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/,
                                      int32x4_t *left /*[32]*/,
                                      int32x4_t *right /* [32] */) {
  int row;

  for (row = 0; row < 32; ++row) {
    left[row] = vshll_n_s16(vget_low_s16(a[row]), 2);
  }

  for (row = 0; row < 32; ++row) {
    right[row] = vshll_n_s16(vget_high_s16(a[row]), 2);
  }
}
// Stage 1 butterfly on 32 input vectors (for the first pass this step is done
// as part of the load instead).
//
// For k in 0..15:
//   b[k]      = a[k]      + a[31 - k]
//   b[16 + k] = a[15 - k] - a[16 + k]
// applied independently to the left and right halves. The four loops run in
// the same order as the statements they replace: left sums, right sums, left
// differences, right differences.
static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/,
                                      int32x4_t *a_right /*[32]*/,
                                      int32x4_t *b_left /*[32]*/,
                                      int32x4_t *b_right /*[32]*/) {
  int k;

  // Sums of mirrored pairs go to the first half of the output.
  for (k = 0; k < 16; ++k) {
    b_left[k] = vaddq_s32(a_left[k], a_left[31 - k]);
  }
  for (k = 0; k < 16; ++k) {
    b_right[k] = vaddq_s32(a_right[k], a_right[31 - k]);
  }

  // Differences of mirrored pairs go to the second half, innermost pair first.
  for (k = 0; k < 16; ++k) {
    b_left[16 + k] = vsubq_s32(a_left[15 - k], a_left[16 + k]);
  }
  for (k = 0; k < 16; ++k) {
    b_right[16 + k] = vsubq_s32(a_right[15 - k], a_right[16 + k]);
  }
}
// Apply add_round_shift_s32() in place to all 32 vectors of |left|, then to
// all 32 vectors of |right|.
//
// This is the partial rounding shift of the reference transform:
//   output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/,
                                                  int32x4_t *right /* [32] */) {
  int idx;

  for (idx = 0; idx < 32; ++idx) {
    left[idx] = add_round_shift_s32(left[idx]);
  }

  for (idx = 0; idx < 32; ++idx) {
    right[idx] = add_round_shift_s32(right[idx]);
  }
}
// Apply sub_round_shift_s32() in place to all 32 vectors of |left|, then to
// all 32 vectors of |right|.
//
// Also compute partial rounding shift:
//   output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
// NOTE(review): this formula is word-for-word the one on the add variant;
// confirm it matches what sub_round_shift_s32() actually computes.
static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/,
                                                  int32x4_t *right /* [32] */) {
  int idx;

  for (idx = 0; idx < 32; ++idx) {
    left[idx] = sub_round_shift_s32(left[idx]);
  }

  for (idx = 0; idx < 32; ++idx) {
    right[idx] = sub_round_shift_s32(right[idx]);
  }
}
static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/,
int32x4_t *right /*32*/) {
int32x4_t al[32 ], ar[32 ];
int32x4_t bl[32 ], br[32 ];
// Stage 1: Done as part of the load.
// Stage 2.
// Mini cross. X the first 16 values and the middle 8 of the second half.
al[0 ] = vaddq_s32(left[0 ], left[15 ]);
ar[0 ] = vaddq_s32(right[0 ], right[15 ]);
al[1 ] = vaddq_s32(left[1 ], left[14 ]);
ar[1 ] = vaddq_s32(right[1 ], right[14 ]);
al[2 ] = vaddq_s32(left[2 ], left[13 ]);
ar[2 ] = vaddq_s32(right[2 ], right[13 ]);
al[3 ] = vaddq_s32(left[3 ], left[12 ]);
ar[3 ] = vaddq_s32(right[3 ], right[12 ]);
al[4 ] = vaddq_s32(left[4 ], left[11 ]);
ar[4 ] = vaddq_s32(right[4 ], right[11 ]);
al[5 ] = vaddq_s32(left[5 ], left[10 ]);
ar[5 ] = vaddq_s32(right[5 ], right[10 ]);
al[6 ] = vaddq_s32(left[6 ], left[9 ]);
ar[6 ] = vaddq_s32(right[6 ], right[9 ]);
al[7 ] = vaddq_s32(left[7 ], left[8 ]);
ar[7 ] = vaddq_s32(right[7 ], right[8 ]);
al[8 ] = vsubq_s32(left[7 ], left[8 ]);
ar[8 ] = vsubq_s32(right[7 ], right[8 ]);
al[9 ] = vsubq_s32(left[6 ], left[9 ]);
ar[9 ] = vsubq_s32(right[6 ], right[9 ]);
al[10 ] = vsubq_s32(left[5 ], left[10 ]);
ar[10 ] = vsubq_s32(right[5 ], right[10 ]);
al[11 ] = vsubq_s32(left[4 ], left[11 ]);
ar[11 ] = vsubq_s32(right[4 ], right[11 ]);
al[12 ] = vsubq_s32(left[3 ], left[12 ]);
ar[12 ] = vsubq_s32(right[3 ], right[12 ]);
al[13 ] = vsubq_s32(left[2 ], left[13 ]);
ar[13 ] = vsubq_s32(right[2 ], right[13 ]);
al[14 ] = vsubq_s32(left[1 ], left[14 ]);
ar[14 ] = vsubq_s32(right[1 ], right[14 ]);
al[15 ] = vsubq_s32(left[0 ], left[15 ]);
ar[15 ] = vsubq_s32(right[0 ], right[15 ]);
al[16 ] = left[16 ];
ar[16 ] = right[16 ];
al[17 ] = left[17 ];
ar[17 ] = right[17 ];
al[18 ] = left[18 ];
ar[18 ] = right[18 ];
al[19 ] = left[19 ];
ar[19 ] = right[19 ];
butterfly_one_coeff_s32_fast(left[27 ], right[27 ], left[20 ], right[20 ],
cospi_16_64, &al[27 ], &ar[27 ], &al[le='color: green'>20 ], &ar[20 ]);
butterfly_one_coeff_s32_fast(left[26 ], right[26 ], left[21 ], right[21 ],
cospi_16_64, &al[26 ], &ar[26 ], &al[le='color: green'>21 ], &ar[21 ]);
butterfly_one_coeff_s32_fast(left[25 ], right[25 ], left[22 ], right[22 ],
cospi_16_64, &al[25 ], &ar[25 ], &al[le='color: green'>22 ], &ar[22 ]);
butterfly_one_coeff_s32_fast(left[24 ], right[24 ], left[23 ], right[23 ],
cospi_16_64, &al[24 ], &ar[24 ], &al[le='color: green'>23 ], &ar[23 ]);
al[28 ] = left[28 ];
ar[28 ] = right[28 ];
al[29 ] = left[29 ];
ar[29 ] = right[29 ];
al[30 ] = left[30 ];
ar[30 ] = right[30 ];
al[31 ] = left[31 ];
ar[31 ] = right[31 ];
// Stage 3.
bl[0 ] = vaddq_s32(al[0 ], al[7 ]);
br[0 ] = vaddq_s32(ar[0 ], ar[7 ]);
bl[1 ] = vaddq_s32(al[1 ], al[6 ]);
br[1 ] = vaddq_s32(ar[1 ], ar[6 ]);
bl[2 ] = vaddq_s32(al[2 ], al[5 ]);
br[2 ] = vaddq_s32(ar[2 ], ar[5 ]);
bl[3 ] = vaddq_s32(al[3 ], al[4 ]);
br[3 ] = vaddq_s32(ar[3 ], ar[4 ]);
bl[4 ] = vsubq_s32(al[3 ], al[4 ]);
br[4 ] = vsubq_s32(ar[3 ], ar[4 ]);
bl[5 ] = vsubq_s32(al[2 ], al[5 ]);
br[5 ] = vsubq_s32(ar[2 ], ar[5 ]);
bl[6 ] = vsubq_s32(al[1 ], al[6 ]);
br[6 ] = vsubq_s32(ar[1 ], ar[6 ]);
bl[7 ] = vsubq_s32(al[0 ], al[7 ]);
br[7 ] = vsubq_s32(ar[0 ], ar[7 ]);
bl[8 ] = al[8 ];
br[8 ] = ar[8 ];
bl[9 ] = al[9 ];
br[9 ] = ar[9 ];
butterfly_one_coeff_s32_fast(al[13 ], ar[13 ], al[10 ], ar[10 ], cospi_16_64,
&bl[13 ], &br[13 ], &bl[10 ], &br[10 ]);
butterfly_one_coeff_s32_fast(al[12 ], ar[12 ], al[11 ], ar[11 ], cospi_16_64,
&bl[12 ], &br[12 ], &bl[11 ], &br[11 ]);
bl[14 ] = al[14 ];
br[14 ] = ar[14 ];
bl[15 ] = al[15 ];
br[15 ] = ar[15 ];
bl[16 ] = vaddq_s32(left[16 ], al[23 ]);
br[16 ] = vaddq_s32(right[16 ], ar[23 ]);
bl[17 ] = vaddq_s32(left[17 ], al[22 ]);
br[17 ] = vaddq_s32(right[17 ], ar[22 ]);
bl[18 ] = vaddq_s32(left[18 ], al[21 ]);
br[18 ] = vaddq_s32(right[18 ], ar[21 ]);
bl[19 ] = vaddq_s32(left[19 ], al[20 ]);
br[19 ] = vaddq_s32(right[19 ], ar[20 ]);
bl[20 ] = vsubq_s32(left[19 ], al[20 ]);
br[20 ] = vsubq_s32(right[19 ], ar[20 ]);
bl[21 ] = vsubq_s32(left[18 ], al[21 ]);
br[21 ] = vsubq_s32(right[18 ], ar[21 ]);
bl[22 ] = vsubq_s32(left[17 ], al[22 ]);
br[22 ] = vsubq_s32(right[17 ], ar[22 ]);
bl[23 ] = vsubq_s32(left[16 ], al[23 ]);
br[23 ] = vsubq_s32(right[16 ], ar[23 ]);
bl[24 ] = vsubq_s32(left[31 ], al[24 ]);
br[24 ] = vsubq_s32(right[31 ], ar[24 ]);
bl[25 ] = vsubq_s32(left[30 ], al[25 ]);
br[25 ] = vsubq_s32(right[30 ], ar[25 ]);
bl[26 ] = vsubq_s32(left[29 ], al[26 ]);
br[26 ] = vsubq_s32(right[29 ], ar[26 ]);
bl[27 ] = vsubq_s32(left[28 ], al[27 ]);
br[27 ] = vsubq_s32(right[28 ], ar[27 ]);
bl[28 ] = vaddq_s32(left[28 ], al[27 ]);
br[28 ] = vaddq_s32(right[28 ], ar[27 ]);
bl[29 ] = vaddq_s32(left[29 ], al[26 ]);
br[29 ] = vaddq_s32(right[29 ], ar[26 ]);
bl[30 ] = vaddq_s32(left[30 ], al[25 ]);
br[30 ] = vaddq_s32(right[30 ], ar[25 ]);
bl[31 ] = vaddq_s32(left[31 ], al[24 ]);
br[31 ] = vaddq_s32(right[31 ], ar[24 ]);
// Stage 4.
al[0 ] = vaddq_s32(bl[0 ], bl[3 ]);
ar[0 ] = vaddq_s32(br[0 ], br[3 ]);
al[1 ] = vaddq_s32(bl[1 ], bl[2 ]);
ar[1 ] = vaddq_s32(br[1 ], br[2 ]);
al[2 ] = vsubq_s32(bl[1 ], bl[2 ]);
ar[2 ] = vsubq_s32(br[1 ], br[2 ]);
al[3 ] = vsubq_s32(bl[0 ], bl[3 ]);
ar[3 ] = vsubq_s32(br[0 ], br[3 ]);
al[4 ] = bl[4 ];
ar[4 ] = br[4 ];
butterfly_one_coeff_s32_fast(bl[6 ], br[6 ], bl[5 ], br[5 ], cospi_16_64, &al[6 ],
&ar[6 ], &al[5 ], &ar[5 ]);
al[7 ] = bl[7 ];
ar[7 ] = br[7 ];
al[8 ] = vaddq_s32(bl[8 ], bl[11 ]);
ar[8 ] = vaddq_s32(br[8 ], br[11 ]);
al[9 ] = vaddq_s32(bl[9 ], bl[10 ]);
ar[9 ] = vaddq_s32(br[9 ], br[10 ]);
al[10 ] = vsubq_s32(bl[9 ], bl[10 ]);
ar[10 ] = vsubq_s32(br[9 ], br[10 ]);
al[11 ] = vsubq_s32(bl[8 ], bl[11 ]);
ar[11 ] = vsubq_s32(br[8 ], br[11 ]);
al[12 ] = vsubq_s32(bl[15 ], bl[12 ]);
ar[12 ] = vsubq_s32(br[15 ], br[12 ]);
al[13 ] = vsubq_s32(bl[14 ], bl[13 ]);
ar[13 ] = vsubq_s32(br[14 ], br[13 ]);
al[14 ] = vaddq_s32(bl[14 ], bl[13 ]);
ar[14 ] = vaddq_s32(br[14 ], br[13 ]);
al[15 ] = vaddq_s32(bl[15 ], bl[12 ]);
ar[15 ] = vaddq_s32(br[15 ], br[12 ]);
al[16 ] = bl[16 ];
ar[16 ] = br[16 ];
al[17 ] = bl[17 ];
ar[17 ] = br[17 ];
butterfly_two_coeff_s32_s64_narrow(bl[29 ], br[29 ], bl[18 ], br[18 ], cospi_8_64,
cospi_24_64, &al[29 ], &ar[29 ], &al[le='color: green'>18 ],
&ar[18 ]);
butterfly_two_coeff_s32_s64_narrow(bl[28 ], br[28 ], bl[19 ], br[19 ], cospi_8_64,
cospi_24_64, &al[28 ], &ar[28 ], &al[le='color: green'>19 ],
&ar[19 ]);
butterfly_two_coeff_s32_s64_narrow(bl[27 ], br[27 ], bl[20 ], br[20 ],
cospi_24_64, -cospi_8_64, &al[27 ], &ar[27 ],
&al[20 ], &ar[20 ]);
butterfly_two_coeff_s32_s64_narrow(bl[26 ], br[26 ], bl[21 ], br[21 ],
cospi_24_64, -cospi_8_64, &al[26 ], &ar[26 ],
&al[21 ], &ar[21 ]);
al[22 ] = bl[22 ];
ar[22 ] = br[22 ];
al[23 ] = bl[23 ];
ar[23 ] = br[23 ];
al[24 ] = bl[24 ];
ar[24 ] = br[24 ];
al[25 ] = bl[25 ];
ar[25 ] = br[25 ];
al[30 ] = bl[30 ];
ar[30 ] = br[30 ];
al[31 ] = bl[31 ];
ar[31 ] = br[31 ];
// Stage 5.
butterfly_one_coeff_s32_fast(al[0 ], ar[0 ], al[1 ], ar[1 ], cospi_16_64, &bl[0 ],
&br[0 ], &bl[1 ], &br[1 ]);
butterfly_two_coeff_s32_s64_narrow(al[3 ], ar[3 ], al[2 ], ar[2 ], cospi_8_64,
cospi_24_64, &bl[2 ], &br[2 ], &bl[='color: green'>3 ],
&br[3 ]);
bl[4 ] = vaddq_s32(al[4 ], al[5 ]);
br[4 ] = vaddq_s32(ar[4 ], ar[5 ]);
bl[5 ] = vsubq_s32(al[4 ], al[5 ]);
br[5 ] = vsubq_s32(ar[4 ], ar[5 ]);
bl[6 ] = vsubq_s32(al[7 ], al[6 ]);
br[6 ] = vsubq_s32(ar[7 ], ar[6 ]);
bl[7 ] = vaddq_s32(al[7 ], al[6 ]);
br[7 ] = vaddq_s32(ar[7 ], ar[6 ]);
bl[8 ] = al[8 ];
br[8 ] = ar[8 ];
butterfly_two_coeff_s32_s64_narrow(al[14 ], ar[14 ], al[9 ], ar[9 ], cospi_8_64,
cospi_24_64, &bl[14 ], &br[14 ], &bl[le='color: green'>9 ],
&br[9 ]);
butterfly_two_coeff_s32_s64_narrow(al[13 ], ar[13 ], al[10 ], ar[10 ],
cospi_24_64, -cospi_8_64, &bl[13 ], &br[13 ],
&bl[10 ], &br[10 ]);
bl[11 ] = al[11 ];
br[11 ] = ar[11 ];
bl[12 ] = al[12 ];
br[12 ] = ar[12 ];
bl[15 ] = al[15 ];
br[15 ] = ar[15 ];
bl[16 ] = vaddq_s32(al[19 ], al[16 ]);
br[16 ] = vaddq_s32(ar[19 ], ar[16 ]);
bl[17 ] = vaddq_s32(al[18 ], al[17 ]);
br[17 ] = vaddq_s32(ar[18 ], ar[17 ]);
bl[18 ] = vsubq_s32(al[17 ], al[18 ]);
br[18 ] = vsubq_s32(ar[17 ], ar[18 ]);
bl[19 ] = vsubq_s32(al[16 ], al[19 ]);
br[19 ] = vsubq_s32(ar[16 ], ar[19 ]);
bl[20 ] = vsubq_s32(al[23 ], al[20 ]);
br[20 ] = vsubq_s32(ar[23 ], ar[20 ]);
bl[21 ] = vsubq_s32(al[22 ], al[21 ]);
br[21 ] = vsubq_s32(ar[22 ], ar[21 ]);
bl[22 ] = vaddq_s32(al[21 ], al[22 ]);
br[22 ] = vaddq_s32(ar[21 ], ar[22 ]);
bl[23 ] = vaddq_s32(al[20 ], al[23 ]);
br[23 ] = vaddq_s32(ar[20 ], ar[23 ]);
bl[24 ] = vaddq_s32(al[27 ], al[24 ]);
br[24 ] = vaddq_s32(ar[27 ], ar[24 ]);
bl[25 ] = vaddq_s32(al[26 ], al[25 ]);
br[25 ] = vaddq_s32(ar[26 ], ar[25 ]);
bl[26 ] = vsubq_s32(al[25 ], al[26 ]);
br[26 ] = vsubq_s32(ar[25 ], ar[26 ]);
bl[27 ] = vsubq_s32(al[24 ], al[27 ]);
br[27 ] = vsubq_s32(ar[24 ], ar[27 ]);
bl[28 ] = vsubq_s32(al[31 ], al[28 ]);
br[28 ] = vsubq_s32(ar[31 ], ar[28 ]);
bl[29 ] = vsubq_s32(al[30 ], al[29 ]);
br[29 ] = vsubq_s32(ar[30 ], ar[29 ]);
bl[30 ] = vaddq_s32(al[29 ], al[30 ]);
br[30 ] = vaddq_s32(ar[29 ], ar[30 ]);
bl[31 ] = vaddq_s32(al[28 ], al[31 ]);
br[31 ] = vaddq_s32(ar[28 ], ar[31 ]);
// Stage 6.
al[0 ] = bl[0 ];
ar[0 ] = br[0 ];
al[1 ] = bl[1 ];
ar[1 ] = br[1 ];
al[2 ] = bl[2 ];
ar[2 ] = br[2 ];
al[3 ] = bl[3 ];
ar[3 ] = br[3 ];
butterfly_two_coeff_s32_s64_narrow(bl[7 ], br[7 ], bl[4 ], br[4 ], cospi_4_64,
cospi_28_64, &al[4 ], &ar[4 ], &al[='color: green'>7 ],
&ar[7 ]);
butterfly_two_coeff_s32_s64_narrow(bl[6 ], br[6 ], bl[5 ], br[5 ], cospi_20_64,
cospi_12_64, &al[5 ], &ar[5 ], &al[='color: green'>6 ],
&ar[6 ]);
al[8 ] = vaddq_s32(bl[8 ], bl[9 ]);
ar[8 ] = vaddq_s32(br[8 ], br[9 ]);
al[9 ] = vsubq_s32(bl[8 ], bl[9 ]);
ar[9 ] = vsubq_s32(br[8 ], br[9 ]);
al[10 ] = vsubq_s32(bl[11 ], bl[10 ]);
ar[10 ] = vsubq_s32(br[11 ], br[10 ]);
al[11 ] = vaddq_s32(bl[11 ], bl[10 ]);
ar[11 ] = vaddq_s32(br[11 ], br[10 ]);
al[12 ] = vaddq_s32(bl[12 ], bl[13 ]);
ar[12 ] = vaddq_s32(br[12 ], br[13 ]);
al[13 ] = vsubq_s32(bl[12 ], bl[13 ]);
ar[13 ] = vsubq_s32(br[12 ], br[13 ]);
al[14 ] = vsubq_s32(bl[15 ], bl[14 ]);
ar[14 ] = vsubq_s32(br[15 ], br[14 ]);
al[15 ] = vaddq_s32(bl[15 ], bl[14 ]);
ar[15 ] = vaddq_s32(br[15 ], br[14 ]);
al[16 ] = bl[16 ];
ar[16 ] = br[16 ];
al[19 ] = bl[19 ];
ar[19 ] = br[19 ];
al[20 ] = bl[20 ];
ar[20 ] = br[20 ];
al[23 ] = bl[23 ];
ar[23 ] = br[23 ];
al[24 ] = bl[24 ];
ar[24 ] = br[24 ];
al[27 ] = bl[27 ];
ar[27 ] = br[27 ];
al[28 ] = bl[28 ];
ar[28 ] = br[28 ];
al[31 ] = bl[31 ];
ar[31 ] = br[31 ];
butterfly_two_coeff_s32_s64_narrow(bl[30 ], br[30 ], bl[17 ], br[17 ], cospi_4_64,
cospi_28_64, &al[30 ], &ar[30 ], &al[le='color: green'>17 ],
&ar[17 ]);
butterfly_two_coeff_s32_s64_narrow(bl[29 ], br[29 ], bl[18 ], br[18 ],
cospi_28_64, -cospi_4_64, &al[29 ], &ar[29 ],
&al[18 ], &ar[18 ]);
butterfly_two_coeff_s32_s64_narrow(bl[26 ], br[26 ], bl[21 ], br[21 ],
cospi_20_64, cospi_12_64, &al[26 ], &ar[26 ],
&al[21 ], &ar[21 ]);
butterfly_two_coeff_s32_s64_narrow(bl[25 ], br[25 ], bl[22 ], br[22 ],
cospi_12_64, -cospi_20_64, &al[25 ],
&ar[25 ], &al[22 ], &ar[22 ]);
// Stage 7.
bl[0 ] = al[0 ];
br[0 ] = ar[0 ];
bl[1 ] = al[1 ];
br[1 ] = ar[1 ];
bl[2 ] = al[2 ];
br[2 ] = ar[2 ];
bl[3 ] = al[3 ];
br[3 ] = ar[3 ];
bl[4 ] = al[4 ];
br[4 ] = ar[4 ];
bl[5 ] = al[5 ];
br[5 ] = ar[5 ];
bl[6 ] = al[6 ];
br[6 ] = ar[6 ];
bl[7 ] = al[7 ];
br[7 ] = ar[7 ];
butterfly_two_coeff_s32_s64_narrow(al[15 ], ar[15 ], al[8 ], ar[8 ], cospi_2_64,
cospi_30_64, &bl[8 ], &br[8 ], &bl[='color: green'>15 ],
&br[15 ]);
butterfly_two_coeff_s32_s64_narrow(al[14 ], ar[14 ], al[9 ], ar[9 ], cospi_18_64,
cospi_14_64, &bl[9 ], &br[9 ], &bl[='color: green'>14 ],
&br[14 ]);
butterfly_two_coeff_s32_s64_narrow(al[13 ], ar[13 ], al[10 ], ar[10 ],
cospi_10_64, cospi_22_64, &bl[10 ], &br[10 ],
&bl[13 ], &br[13 ]);
butterfly_two_coeff_s32_s64_narrow(al[12 ], ar[12 ], al[11 ], ar[11 ],
cospi_26_64, cospi_6_64, &bl[11 ], &br[11 ],
&bl[12 ], &br[12 ]);
bl[16 ] = vaddq_s32(al[16 ], al[17 ]);
br[16 ] = vaddq_s32(ar[16 ], ar[17 ]);
bl[17 ] = vsubq_s32(al[16 ], al[17 ]);
br[17 ] = vsubq_s32(ar[16 ], ar[17 ]);
bl[18 ] = vsubq_s32(al[19 ], al[18 ]);
br[18 ] = vsubq_s32(ar[19 ], ar[18 ]);
bl[19 ] = vaddq_s32(al[19 ], al[18 ]);
br[19 ] = vaddq_s32(ar[19 ], ar[18 ]);
bl[20 ] = vaddq_s32(al[20 ], al[21 ]);
br[20 ] = vaddq_s32(ar[20 ], ar[21 ]);
bl[21 ] = vsubq_s32(al[20 ], al[21 ]);
br[21 ] = vsubq_s32(ar[20 ], ar[21 ]);
bl[22 ] = vsubq_s32(al[23 ], al[22 ]);
br[22 ] = vsubq_s32(ar[23 ], ar[22 ]);
bl[23 ] = vaddq_s32(al[23 ], al[22 ]);
br[23 ] = vaddq_s32(ar[23 ], ar[22 ]);
bl[24 ] = vaddq_s32(al[24 ], al[25 ]);
br[24 ] = vaddq_s32(ar[24 ], ar[25 ]);
bl[25 ] = vsubq_s32(al[24 ], al[25 ]);
br[25 ] = vsubq_s32(ar[24 ], ar[25 ]);
bl[26 ] = vsubq_s32(al[27 ], al[26 ]);
br[26 ] = vsubq_s32(ar[27 ], ar[26 ]);
bl[27 ] = vaddq_s32(al[27 ], al[26 ]);
br[27 ] = vaddq_s32(ar[27 ], ar[26 ]);
bl[28 ] = vaddq_s32(al[28 ], al[29 ]);
br[28 ] = vaddq_s32(ar[28 ], ar[29 ]);
bl[29 ] = vsubq_s32(al[28 ], al[29 ]);
br[29 ] = vsubq_s32(ar[28 ], ar[29 ]);
bl[30 ] = vsubq_s32(al[31 ], al[30 ]);
br[30 ] = vsubq_s32(ar[31 ], ar[30 ]);
bl[31 ] = vaddq_s32(al[31 ], al[30 ]);
br[31 ] = vaddq_s32(ar[31 ], ar[30 ]);
// Final stage.
left[0 ] = bl[0 ];
right[0 ] = br[0 ];
left[16 ] = bl[1 ];
right[16 ] = br[1 ];
left[8 ] = bl[2 ];
right[8 ] = br[2 ];
left[24 ] = bl[3 ];
right[24 ] = br[3 ];
left[4 ] = bl[4 ];
right[4 ] = br[4 ];
left[20 ] = bl[5 ];
right[20 ] = br[5 ];
left[12 ] = bl[6 ];
right[12 ] = br[6 ];
left[28 ] = bl[7 ];
right[28 ] = br[7 ];
left[2 ] = bl[8 ];
right[2 ] = br[8 ];
left[18 ] = bl[9 ];
right[18 ] = br[9 ];
left[10 ] = bl[10 ];
right[10 ] = br[10 ];
left[26 ] = bl[11 ];
right[26 ] = br[11 ];
left[6 ] = bl[12 ];
right[6 ] = br[12 ];
left[22 ] = bl[13 ];
right[22 ] = br[13 ];
left[14 ] = bl[14 ];
right[14 ] = br[14 ];
left[30 ] = bl[15 ];
right[30 ] = br[15 ];
butterfly_two_coeff_s32_s64_narrow(bl[31 ], br[31 ], bl[16 ], br[16 ], cospi_1_64,
cospi_31_64, &al[1 ], &ar[1 ], &al[='color: green'>31 ],
&ar[31 ]);
left[1 ] = al[1 ];
right[1 ] = ar[1 ];
left[31 ] = al[31 ];
right[31 ] = ar[31 ];
butterfly_two_coeff_s32_s64_narrow(bl[30 ], br[30 ], bl[17 ], br[17 ],
cospi_17_64, cospi_15_64, &al[17 ], &ar[17 ],
&al[15 ], &ar[15 ]);
left[17 ] = al[17 ];
right[17 ] = ar[17 ];
left[15 ] = al[15 ];
right[15 ] = ar[15 ];
butterfly_two_coeff_s32_s64_narrow(bl[29 ], br[29 ], bl[18 ], br[18 ], cospi_9_64,
cospi_23_64, &al[9 ], &ar[9 ], &al[='color: green'>23 ],
&ar[23 ]);
left[9 ] = al[9 ];
right[9 ] = ar[9 ];
left[23 ] = al[23 ];
right[23 ] = ar[23 ];
butterfly_two_coeff_s32_s64_narrow(bl[28 ], br[28 ], bl[19 ], br[19 ],
cospi_25_64, cospi_7_64, &al[25 ], &ar[25 ],
&al[7 ], &ar[7 ]);
left[25 ] = al[25 ];
right[25 ] = ar[25 ];
left[7 ] = al[7 ];
right[7 ] = ar[7 ];
butterfly_two_coeff_s32_s64_narrow(bl[27 ], br[27 ], bl[20 ], br[20 ], cospi_5_64,
cospi_27_64, &al[5 ], &ar[5 ], &al[='color: green'>27 ],
&ar[27 ]);
left[5 ] = al[5 ];
right[5 ] = ar[5 ];
left[27 ] = al[27 ];
right[27 ] = ar[27 ];
butterfly_two_coeff_s32_s64_narrow(bl[26 ], br[26 ], bl[21 ], br[21 ],
cospi_21_64, cospi_11_64, &al[21 ], &ar[21 ],
&al[11 ], &ar[11 ]);
left[21 ] = al[21 ];
right[21 ] = ar[21 ];
left[11 ] = al[11 ];
right[11 ] = ar[11 ];
butterfly_two_coeff_s32_s64_narrow(bl[25 ], br[25 ], bl[22 ], br[22 ],
cospi_13_64, cospi_19_64, &al[13 ], &ar[13 ],
&al[19 ], &ar[19 ]);
left[13 ] = al[13 ];
right[13 ] = ar[13 ];
left[19 ] = al[19 ];
right[19 ] = ar[19 ];
butterfly_two_coeff_s32_s64_narrow(bl[24 ], br[24 ], bl[23 ], br[23 ],
cospi_29_64, cospi_3_64, &al[29 ], &ar[29 ],
&al[3 ], &ar[3 ]);
left[29 ] = al[29 ];
right[29 ] = ar[29 ];
left[3 ] = al[3 ];
right[3 ] = ar[3 ];
}
// Stages 2-7 plus the final output stage of the 32-point transform used by the
// high bit depth second pass, computed in place.
//
// left / right: arrays of 32 int32x4_t each. Entry i is processed as the pair
// (left[i], right[i]); every operation below is applied identically to both
// halves. On return both arrays hold the transformed values.
//
// The inputs are only read in stages 2 and 3 and only written in the final
// stage, so operating in place is safe. al/ar and bl/br are ping-pong scratch
// buffers: each stage reads one pair and writes the other.
//
// NOTE(review): the exact rounding/precision behaviour is defined by the
// butterfly_* helpers declared elsewhere (presumably fdct_neon.h) - confirm
// there before changing which helper a stage uses.
static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/,
                                                   int32x4_t *right /*32*/) {
  int32x4_t al[32], ar[32];
  int32x4_t bl[32], br[32];

  // Stage 1: Done as part of the load.

  // Stage 2.
  // Mini cross. X the first 16 values and the middle 8 of the second half.
  al[0] = vaddq_s32(left[0], left[15]);
  ar[0] = vaddq_s32(right[0], right[15]);
  al[1] = vaddq_s32(left[1], left[14]);
  ar[1] = vaddq_s32(right[1], right[14]);
  al[2] = vaddq_s32(left[2], left[13]);
  ar[2] = vaddq_s32(right[2], right[13]);
  al[3] = vaddq_s32(left[3], left[12]);
  ar[3] = vaddq_s32(right[3], right[12]);
  al[4] = vaddq_s32(left[4], left[11]);
  ar[4] = vaddq_s32(right[4], right[11]);
  al[5] = vaddq_s32(left[5], left[10]);
  ar[5] = vaddq_s32(right[5], right[10]);
  al[6] = vaddq_s32(left[6], left[9]);
  ar[6] = vaddq_s32(right[6], right[9]);
  al[7] = vaddq_s32(left[7], left[8]);
  ar[7] = vaddq_s32(right[7], right[8]);

  al[8] = vsubq_s32(left[7], left[8]);
  ar[8] = vsubq_s32(right[7], right[8]);
  al[9] = vsubq_s32(left[6], left[9]);
  ar[9] = vsubq_s32(right[6], right[9]);
  al[10] = vsubq_s32(left[5], left[10]);
  ar[10] = vsubq_s32(right[5], right[10]);
  al[11] = vsubq_s32(left[4], left[11]);
  ar[11] = vsubq_s32(right[4], right[11]);
  al[12] = vsubq_s32(left[3], left[12]);
  ar[12] = vsubq_s32(right[3], right[12]);
  al[13] = vsubq_s32(left[2], left[13]);
  ar[13] = vsubq_s32(right[2], right[13]);
  al[14] = vsubq_s32(left[1], left[14]);
  ar[14] = vsubq_s32(right[1], right[14]);
  al[15] = vsubq_s32(left[0], left[15]);
  ar[15] = vsubq_s32(right[0], right[15]);

  // Entries 16-19 and 28-31 pass through unchanged in this stage.
  al[16] = left[16];
  ar[16] = right[16];
  al[17] = left[17];
  ar[17] = right[17];
  al[18] = left[18];
  ar[18] = right[18];
  al[19] = left[19];
  ar[19] = right[19];

  butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
                               cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
  butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
                               cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
  butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
                               cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
  butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
                               cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);

  al[28] = left[28];
  ar[28] = right[28];
  al[29] = left[29];
  ar[29] = right[29];
  al[30] = left[30];
  ar[30] = right[30];
  al[31] = left[31];
  ar[31] = right[31];

  // Stage 3.
  bl[0] = vaddq_s32(al[0], al[7]);
  br[0] = vaddq_s32(ar[0], ar[7]);
  bl[1] = vaddq_s32(al[1], al[6]);
  br[1] = vaddq_s32(ar[1], ar[6]);
  bl[2] = vaddq_s32(al[2], al[5]);
  br[2] = vaddq_s32(ar[2], ar[5]);
  bl[3] = vaddq_s32(al[3], al[4]);
  br[3] = vaddq_s32(ar[3], ar[4]);

  bl[4] = vsubq_s32(al[3], al[4]);
  br[4] = vsubq_s32(ar[3], ar[4]);
  bl[5] = vsubq_s32(al[2], al[5]);
  br[5] = vsubq_s32(ar[2], ar[5]);
  bl[6] = vsubq_s32(al[1], al[6]);
  br[6] = vsubq_s32(ar[1], ar[6]);
  bl[7] = vsubq_s32(al[0], al[7]);
  br[7] = vsubq_s32(ar[0], ar[7]);

  bl[8] = al[8];
  br[8] = ar[8];
  bl[9] = al[9];
  br[9] = ar[9];

  butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
                               &bl[13], &br[13], &bl[10], &br[10]);
  butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
                               &bl[12], &br[12], &bl[11], &br[11]);

  bl[14] = al[14];
  br[14] = ar[14];
  bl[15] = al[15];
  br[15] = ar[15];

  // left/right[16..19] and [28..31] are read directly here; they hold the same
  // values as the corresponding al/ar entries copied in stage 2.
  bl[16] = vaddq_s32(left[16], al[23]);
  br[16] = vaddq_s32(right[16], ar[23]);
  bl[17] = vaddq_s32(left[17], al[22]);
  br[17] = vaddq_s32(right[17], ar[22]);
  bl[18] = vaddq_s32(left[18], al[21]);
  br[18] = vaddq_s32(right[18], ar[21]);
  bl[19] = vaddq_s32(left[19], al[20]);
  br[19] = vaddq_s32(right[19], ar[20]);

  bl[20] = vsubq_s32(left[19], al[20]);
  br[20] = vsubq_s32(right[19], ar[20]);
  bl[21] = vsubq_s32(left[18], al[21]);
  br[21] = vsubq_s32(right[18], ar[21]);
  bl[22] = vsubq_s32(left[17], al[22]);
  br[22] = vsubq_s32(right[17], ar[22]);
  bl[23] = vsubq_s32(left[16], al[23]);
  br[23] = vsubq_s32(right[16], ar[23]);

  bl[24] = vsubq_s32(left[31], al[24]);
  br[24] = vsubq_s32(right[31], ar[24]);
  bl[25] = vsubq_s32(left[30], al[25]);
  br[25] = vsubq_s32(right[30], ar[25]);
  bl[26] = vsubq_s32(left[29], al[26]);
  br[26] = vsubq_s32(right[29], ar[26]);
  bl[27] = vsubq_s32(left[28], al[27]);
  br[27] = vsubq_s32(right[28], ar[27]);

  bl[28] = vaddq_s32(left[28], al[27]);
  br[28] = vaddq_s32(right[28], ar[27]);
  bl[29] = vaddq_s32(left[29], al[26]);
  br[29] = vaddq_s32(right[29], ar[26]);
  bl[30] = vaddq_s32(left[30], al[25]);
  br[30] = vaddq_s32(right[30], ar[25]);
  bl[31] = vaddq_s32(left[31], al[24]);
  br[31] = vaddq_s32(right[31], ar[24]);

  // Stage 4.
  al[0] = vaddq_s32(bl[0], bl[3]);
  ar[0] = vaddq_s32(br[0], br[3]);
  al[1] = vaddq_s32(bl[1], bl[2]);
  ar[1] = vaddq_s32(br[1], br[2]);
  al[2] = vsubq_s32(bl[1], bl[2]);
  ar[2] = vsubq_s32(br[1], br[2]);
  al[3] = vsubq_s32(bl[0], bl[3]);
  ar[3] = vsubq_s32(br[0], br[3]);

  al[4] = bl[4];
  ar[4] = br[4];

  butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
                               &ar[6], &al[5], &ar[5]);

  al[7] = bl[7];
  ar[7] = br[7];

  al[8] = vaddq_s32(bl[8], bl[11]);
  ar[8] = vaddq_s32(br[8], br[11]);
  al[9] = vaddq_s32(bl[9], bl[10]);
  ar[9] = vaddq_s32(br[9], br[10]);
  al[10] = vsubq_s32(bl[9], bl[10]);
  ar[10] = vsubq_s32(br[9], br[10]);
  al[11] = vsubq_s32(bl[8], bl[11]);
  ar[11] = vsubq_s32(br[8], br[11]);
  al[12] = vsubq_s32(bl[15], bl[12]);
  ar[12] = vsubq_s32(br[15], br[12]);
  al[13] = vsubq_s32(bl[14], bl[13]);
  ar[13] = vsubq_s32(br[14], br[13]);
  al[14] = vaddq_s32(bl[14], bl[13]);
  ar[14] = vaddq_s32(br[14], br[13]);
  al[15] = vaddq_s32(bl[15], bl[12]);
  ar[15] = vaddq_s32(br[15], br[12]);

  al[16] = bl[16];
  ar[16] = br[16];
  al[17] = bl[17];
  ar[17] = br[17];

  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
                                     cospi_24_64, &al[29], &ar[29], &al[18],
                                     &ar[18]);
  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
                                     cospi_24_64, &al[28], &ar[28], &al[19],
                                     &ar[19]);
  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
                                     cospi_24_64, -cospi_8_64, &al[27], &ar[27],
                                     &al[20], &ar[20]);
  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
                                     cospi_24_64, -cospi_8_64, &al[26], &ar[26],
                                     &al[21], &ar[21]);

  al[22] = bl[22];
  ar[22] = br[22];
  al[23] = bl[23];
  ar[23] = br[23];
  al[24] = bl[24];
  ar[24] = br[24];
  al[25] = bl[25];
  ar[25] = br[25];

  al[30] = bl[30];
  ar[30] = br[30];
  al[31] = bl[31];
  ar[31] = br[31];

  // Stage 5.
  butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
                               &br[0], &bl[1], &br[1]);
  butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
                                     cospi_24_64, &bl[2], &br[2], &bl[3],
                                     &br[3]);

  bl[4] = vaddq_s32(al[4], al[5]);
  br[4] = vaddq_s32(ar[4], ar[5]);
  bl[5] = vsubq_s32(al[4], al[5]);
  br[5] = vsubq_s32(ar[4], ar[5]);
  bl[6] = vsubq_s32(al[7], al[6]);
  br[6] = vsubq_s32(ar[7], ar[6]);
  bl[7] = vaddq_s32(al[7], al[6]);
  br[7] = vaddq_s32(ar[7], ar[6]);

  bl[8] = al[8];
  br[8] = ar[8];

  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
                                     cospi_24_64, &bl[14], &br[14], &bl[9],
                                     &br[9]);
  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
                                     cospi_24_64, -cospi_8_64, &bl[13], &br[13],
                                     &bl[10], &br[10]);

  bl[11] = al[11];
  br[11] = ar[11];
  bl[12] = al[12];
  br[12] = ar[12];

  bl[15] = al[15];
  br[15] = ar[15];

  bl[16] = vaddq_s32(al[19], al[16]);
  br[16] = vaddq_s32(ar[19], ar[16]);
  bl[17] = vaddq_s32(al[18], al[17]);
  br[17] = vaddq_s32(ar[18], ar[17]);
  bl[18] = vsubq_s32(al[17], al[18]);
  br[18] = vsubq_s32(ar[17], ar[18]);
  bl[19] = vsubq_s32(al[16], al[19]);
  br[19] = vsubq_s32(ar[16], ar[19]);
  bl[20] = vsubq_s32(al[23], al[20]);
  br[20] = vsubq_s32(ar[23], ar[20]);
  bl[21] = vsubq_s32(al[22], al[21]);
  br[21] = vsubq_s32(ar[22], ar[21]);
  bl[22] = vaddq_s32(al[21], al[22]);
  br[22] = vaddq_s32(ar[21], ar[22]);
  bl[23] = vaddq_s32(al[20], al[23]);
  br[23] = vaddq_s32(ar[20], ar[23]);
  bl[24] = vaddq_s32(al[27], al[24]);
  br[24] = vaddq_s32(ar[27], ar[24]);
  bl[25] = vaddq_s32(al[26], al[25]);
  br[25] = vaddq_s32(ar[26], ar[25]);
  bl[26] = vsubq_s32(al[25], al[26]);
  br[26] = vsubq_s32(ar[25], ar[26]);
  bl[27] = vsubq_s32(al[24], al[27]);
  br[27] = vsubq_s32(ar[24], ar[27]);
  bl[28] = vsubq_s32(al[31], al[28]);
  br[28] = vsubq_s32(ar[31], ar[28]);
  bl[29] = vsubq_s32(al[30], al[29]);
  br[29] = vsubq_s32(ar[30], ar[29]);
  bl[30] = vaddq_s32(al[29], al[30]);
  br[30] = vaddq_s32(ar[29], ar[30]);
  bl[31] = vaddq_s32(al[28], al[31]);
  br[31] = vaddq_s32(ar[28], ar[31]);

  // Stage 6.
  al[0] = bl[0];
  ar[0] = br[0];
  al[1] = bl[1];
  ar[1] = br[1];
  al[2] = bl[2];
  ar[2] = br[2];
  al[3] = bl[3];
  ar[3] = br[3];

  butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
                                     cospi_28_64, &al[4], &ar[4], &al[7],
                                     &ar[7]);
  butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
                                     cospi_12_64, &al[5], &ar[5], &al[6],
                                     &ar[6]);

  al[8] = vaddq_s32(bl[8], bl[9]);
  ar[8] = vaddq_s32(br[8], br[9]);
  al[9] = vsubq_s32(bl[8], bl[9]);
  ar[9] = vsubq_s32(br[8], br[9]);
  al[10] = vsubq_s32(bl[11], bl[10]);
  ar[10] = vsubq_s32(br[11], br[10]);
  al[11] = vaddq_s32(bl[11], bl[10]);
  ar[11] = vaddq_s32(br[11], br[10]);
  al[12] = vaddq_s32(bl[12], bl[13]);
  ar[12] = vaddq_s32(br[12], br[13]);
  al[13] = vsubq_s32(bl[12], bl[13]);
  ar[13] = vsubq_s32(br[12], br[13]);
  al[14] = vsubq_s32(bl[15], bl[14]);
  ar[14] = vsubq_s32(br[15], br[14]);
  al[15] = vaddq_s32(bl[15], bl[14]);
  ar[15] = vaddq_s32(br[15], br[14]);

  al[16] = bl[16];
  ar[16] = br[16];
  al[19] = bl[19];
  ar[19] = br[19];
  al[20] = bl[20];
  ar[20] = br[20];
  al[23] = bl[23];
  ar[23] = br[23];
  al[24] = bl[24];
  ar[24] = br[24];
  al[27] = bl[27];
  ar[27] = br[27];
  al[28] = bl[28];
  ar[28] = br[28];
  al[31] = bl[31];
  ar[31] = br[31];

  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
                                     cospi_28_64, &al[30], &ar[30], &al[17],
                                     &ar[17]);
  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
                                     cospi_28_64, -cospi_4_64, &al[29], &ar[29],
                                     &al[18], &ar[18]);
  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
                                     cospi_20_64, cospi_12_64, &al[26], &ar[26],
                                     &al[21], &ar[21]);
  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
                                     cospi_12_64, -cospi_20_64, &al[25],
                                     &ar[25], &al[22], &ar[22]);

  // Stage 7.
  bl[0] = al[0];
  br[0] = ar[0];
  bl[1] = al[1];
  br[1] = ar[1];
  bl[2] = al[2];
  br[2] = ar[2];
  bl[3] = al[3];
  br[3] = ar[3];
  bl[4] = al[4];
  br[4] = ar[4];
  bl[5] = al[5];
  br[5] = ar[5];
  bl[6] = al[6];
  br[6] = ar[6];
  bl[7] = al[7];
  br[7] = ar[7];

  butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
                                     cospi_30_64, &bl[8], &br[8], &bl[15],
                                     &br[15]);
  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
                                     cospi_14_64, &bl[9], &br[9], &bl[14],
                                     &br[14]);
  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
                                     cospi_10_64, cospi_22_64, &bl[10], &br[10],
                                     &bl[13], &br[13]);
  butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
                                     cospi_26_64, cospi_6_64, &bl[11], &br[11],
                                     &bl[12], &br[12]);

  bl[16] = vaddq_s32(al[16], al[17]);
  br[16] = vaddq_s32(ar[16], ar[17]);
  bl[17] = vsubq_s32(al[16], al[17]);
  br[17] = vsubq_s32(ar[16], ar[17]);
  bl[18] = vsubq_s32(al[19], al[18]);
  br[18] = vsubq_s32(ar[19], ar[18]);
  bl[19] = vaddq_s32(al[19], al[18]);
  br[19] = vaddq_s32(ar[19], ar[18]);
  bl[20] = vaddq_s32(al[20], al[21]);
  br[20] = vaddq_s32(ar[20], ar[21]);
  bl[21] = vsubq_s32(al[20], al[21]);
  br[21] = vsubq_s32(ar[20], ar[21]);
  bl[22] = vsubq_s32(al[23], al[22]);
  br[22] = vsubq_s32(ar[23], ar[22]);
  bl[23] = vaddq_s32(al[23], al[22]);
  br[23] = vaddq_s32(ar[23], ar[22]);
  bl[24] = vaddq_s32(al[24], al[25]);
  br[24] = vaddq_s32(ar[24], ar[25]);
  bl[25] = vsubq_s32(al[24], al[25]);
  br[25] = vsubq_s32(ar[24], ar[25]);
  bl[26] = vsubq_s32(al[27], al[26]);
  br[26] = vsubq_s32(ar[27], ar[26]);
  bl[27] = vaddq_s32(al[27], al[26]);
  br[27] = vaddq_s32(ar[27], ar[26]);
  bl[28] = vaddq_s32(al[28], al[29]);
  br[28] = vaddq_s32(ar[28], ar[29]);
  bl[29] = vsubq_s32(al[28], al[29]);
  br[29] = vsubq_s32(ar[28], ar[29]);
  bl[30] = vsubq_s32(al[31], al[30]);
  br[30] = vsubq_s32(ar[31], ar[30]);
  bl[31] = vaddq_s32(al[31], al[30]);
  br[31] = vaddq_s32(ar[31], ar[30]);

  // Final stage.
  // bl/br[0..15] are stored to the even output slots; bl[k] lands at the
  // 5-bit bit-reversal of k.
  left[0] = bl[0];
  right[0] = br[0];
  left[16] = bl[1];
  right[16] = br[1];
  left[8] = bl[2];
  right[8] = br[2];
  left[24] = bl[3];
  right[24] = br[3];
  left[4] = bl[4];
  right[4] = br[4];
  left[20] = bl[5];
  right[20] = br[5];
  left[12] = bl[6];
  right[12] = br[6];
  left[28] = bl[7];
  right[28] = br[7];
  left[2] = bl[8];
  right[2] = br[8];
  left[18] = bl[9];
  right[18] = br[9];
  left[10] = bl[10];
  right[10] = br[10];
  left[26] = bl[11];
  right[26] = br[11];
  left[6] = bl[12];
  right[6] = br[12];
  left[22] = bl[13];
  right[22] = br[13];
  left[14] = bl[14];
  right[14] = br[14];
  left[30] = bl[15];
  right[30] = br[15];

  // The odd output slots come from one more butterfly on bl/br[16..31]; al/ar
  // are reused as scratch, indexed by the destination slot.
  butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
                                     cospi_31_64, &al[1], &ar[1], &al[31],
                                     &ar[31]);
  left[1] = al[1];
  right[1] = ar[1];
  left[31] = al[31];
  right[31] = ar[31];

  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
                                     cospi_17_64, cospi_15_64, &al[17], &ar[17],
                                     &al[15], &ar[15]);
  left[17] = al[17];
  right[17] = ar[17];
  left[15] = al[15];
  right[15] = ar[15];

  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
                                     cospi_23_64, &al[9], &ar[9], &al[23],
                                     &ar[23]);
  left[9] = al[9];
  right[9] = ar[9];
  left[23] = al[23];
  right[23] = ar[23];

  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
                                     cospi_25_64, cospi_7_64, &al[25], &ar[25],
                                     &al[7], &ar[7]);
  left[25] = al[25];
  right[25] = ar[25];
  left[7] = al[7];
  right[7] = ar[7];

  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
                                     cospi_27_64, &al[5], &ar[5], &al[27],
                                     &ar[27]);
  left[5] = al[5];
  right[5] = ar[5];
  left[27] = al[27];
  right[27] = ar[27];

  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
                                     cospi_21_64, cospi_11_64, &al[21], &ar[21],
                                     &al[11], &ar[11]);
  left[21] = al[21];
  right[21] = ar[21];
  left[11] = al[11];
  right[11] = ar[11];

  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
                                     cospi_13_64, cospi_19_64, &al[13], &ar[13],
                                     &al[19], &ar[19]);
  left[13] = al[13];
  right[13] = ar[13];
  left[19] = al[19];
  right[19] = ar[19];

  butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
                                     cospi_29_64, cospi_3_64, &al[29], &ar[29],
                                     &al[3], &ar[3]);
  left[29] = al[29];
  right[29] = ar[29];
  left[3] = al[3];
  right[3] = ar[3];
}
static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/,
int32x4_t *right /*32*/) {
int32x4_t al[32 ], ar[32 ];
int32x4_t bl[32 ], br[32 ];
// Stage 1: Done as part of the load.
// Stage 2.
// For the "rd" version, all the values are rounded down after stage 2 to keep
// the values in 16 bits.
al[0 ] = add_round_shift_s32(vaddq_s32(left[0 ], left[15 ]));
ar[0 ] = add_round_shift_s32(vaddq_s32(right[0 ], right[15 ]));
al[1 ] = add_round_shift_s32(vaddq_s32(left[1 ], left[14 ]));
ar[1 ] = add_round_shift_s32(vaddq_s32(right[1 ], right[14 ]));
al[2 ] = add_round_shift_s32(vaddq_s32(left[2 ], left[13 ]));
ar[2 ] = add_round_shift_s32(vaddq_s32(right[2 ], right[13 ]));
al[3 ] = add_round_shift_s32(vaddq_s32(left[3 ], left[12 ]));
ar[3 ] = add_round_shift_s32(vaddq_s32(right[3 ], right[12 ]));
al[4 ] = add_round_shift_s32(vaddq_s32(left[4 ], left[11 ]));
ar[4 ] = add_round_shift_s32(vaddq_s32(right[4 ], right[11 ]));
al[5 ] = add_round_shift_s32(vaddq_s32(left[5 ], left[10 ]));
ar[5 ] = add_round_shift_s32(vaddq_s32(right[5 ], right[10 ]));
al[6 ] = add_round_shift_s32(vaddq_s32(left[6 ], left[9 ]));
ar[6 ] = add_round_shift_s32(vaddq_s32(right[6 ], right[9 ]));
al[7 ] = add_round_shift_s32(vaddq_s32(left[7 ], left[8 ]));
ar[7 ] = add_round_shift_s32(vaddq_s32(right[7 ], right[8 ]));
al[8 ] = add_round_shift_s32(vsubq_s32(left[7 ], left[8 ]));
ar[8 ] = add_round_shift_s32(vsubq_s32(right[7 ], right[8 ]));
al[9 ] = add_round_shift_s32(vsubq_s32(left[6 ], left[9 ]));
ar[9 ] = add_round_shift_s32(vsubq_s32(right[6 ], right[9 ]));
al[10 ] = add_round_shift_s32(vsubq_s32(left[5 ], left[10 ]));
ar[10 ] = add_round_shift_s32(vsubq_s32(right[5 ], right[10 ]));
al[11 ] = add_round_shift_s32(vsubq_s32(left[4 ], left[11 ]));
ar[11 ] = add_round_shift_s32(vsubq_s32(right[4 ], right[11 ]));
al[12 ] = add_round_shift_s32(vsubq_s32(left[3 ], left[12 ]));
ar[12 ] = add_round_shift_s32(vsubq_s32(right[3 ], right[12 ]));
al[13 ] = add_round_shift_s32(vsubq_s32(left[2 ], left[13 ]));
ar[13 ] = add_round_shift_s32(vsubq_s32(right[2 ], right[13 ]));
al[14 ] = add_round_shift_s32(vsubq_s32(left[1 ], left[14 ]));
ar[14 ] = add_round_shift_s32(vsubq_s32(right[1 ], right[14 ]));
al[15 ] = add_round_shift_s32(vsubq_s32(left[0 ], left[15 ]));
ar[15 ] = add_round_shift_s32(vsubq_s32(right[0 ], right[15 ]));
al[16 ] = add_round_shift_s32(left[16 ]);
ar[16 ] = add_round_shift_s32(right[16 ]);
al[17 ] = add_round_shift_s32(left[17 ]);
ar[17 ] = add_round_shift_s32(right[17 ]);
al[18 ] = add_round_shift_s32(left[18 ]);
ar[18 ] = add_round_shift_s32(right[18 ]);
al[19 ] = add_round_shift_s32(left[19 ]);
ar[19 ] = add_round_shift_s32(right[19 ]);
butterfly_one_coeff_s32_fast(left[27 ], right[27 ], left[20 ], right[20 ],
cospi_16_64, &al[27 ], &ar[27 ], &al[le='color: green'>20 ], &ar[20 ]);
butterfly_one_coeff_s32_fast(left[26 ], right[26 ], left[21 ], right[21 ],
cospi_16_64, &al[26 ], &ar[26 ], &al[le='color: green'>21 ], &ar[21 ]);
butterfly_one_coeff_s32_fast(left[25 ], right[25 ], left[22 ], right[22 ],
cospi_16_64, &al[25 ], &ar[25 ], &al[le='color: green'>22 ], &ar[22 ]);
butterfly_one_coeff_s32_fast(left[24 ], right[24 ], left[23 ], right[23 ],
cospi_16_64, &al[24 ], &ar[24 ], &al[le='color: green'>23 ], &ar[23 ]);
al[20 ] = add_round_shift_s32(al[20 ]);
ar[20 ] = add_round_shift_s32(ar[20 ]);
al[21 ] = add_round_shift_s32(al[21 ]);
ar[21 ] = add_round_shift_s32(ar[21 ]);
al[22 ] = add_round_shift_s32(al[22 ]);
ar[22 ] = add_round_shift_s32(ar[22 ]);
al[23 ] = add_round_shift_s32(al[23 ]);
ar[23 ] = add_round_shift_s32(ar[23 ]);
al[24 ] = add_round_shift_s32(al[24 ]);
ar[24 ] = add_round_shift_s32(ar[24 ]);
al[25 ] = add_round_shift_s32(al[25 ]);
ar[25 ] = add_round_shift_s32(ar[25 ]);
al[26 ] = add_round_shift_s32(al[26 ]);
ar[26 ] = add_round_shift_s32(ar[26 ]);
al[27 ] = add_round_shift_s32(al[27 ]);
ar[27 ] = add_round_shift_s32(ar[27 ]);
al[28 ] = add_round_shift_s32(left[28 ]);
ar[28 ] = add_round_shift_s32(right[28 ]);
al[29 ] = add_round_shift_s32(left[29 ]);
ar[29 ] = add_round_shift_s32(right[29 ]);
al[30 ] = add_round_shift_s32(left[30 ]);
ar[30 ] = add_round_shift_s32(right[30 ]);
al[31 ] = add_round_shift_s32(left[31 ]);
ar[31 ] = add_round_shift_s32(right[31 ]);
// Stage 3.
bl[0 ] = vaddq_s32(al[0 ], al[7 ]);
br[0 ] = vaddq_s32(ar[0 ], ar[7 ]);
bl[1 ] = vaddq_s32(al[1 ], al[6 ]);
br[1 ] = vaddq_s32(ar[1 ], ar[6 ]);
bl[2 ] = vaddq_s32(al[2 ], al[5 ]);
br[2 ] = vaddq_s32(ar[2 ], ar[5 ]);
bl[3 ] = vaddq_s32(al[3 ], al[4 ]);
br[3 ] = vaddq_s32(ar[3 ], ar[4 ]);
bl[4 ] = vsubq_s32(al[3 ], al[4 ]);
br[4 ] = vsubq_s32(ar[3 ], ar[4 ]);
bl[5 ] = vsubq_s32(al[2 ], al[5 ]);
br[5 ] = vsubq_s32(ar[2 ], ar[5 ]);
bl[6 ] = vsubq_s32(al[1 ], al[6 ]);
br[6 ] = vsubq_s32(ar[1 ], ar[6 ]);
bl[7 ] = vsubq_s32(al[0 ], al[7 ]);
br[7 ] = vsubq_s32(ar[0 ], ar[7 ]);
bl[8 ] = al[8 ];
br[8 ] = ar[8 ];
bl[9 ] = al[9 ];
br[9 ] = ar[9 ];
butterfly_one_coeff_s32_fast(al[13 ], ar[13 ], al[10 ], ar[10 ], cospi_16_64,
&bl[13 ], &br[13 ], &bl[10 ], &br[10 ]);
butterfly_one_coeff_s32_fast(al[12 ], ar[12 ], al[11 ], ar[11 ], cospi_16_64,
&bl[12 ], &br[12 ], &bl[11 ], &br[11 ]);
bl[14 ] = al[14 ];
br[14 ] = ar[14 ];
bl[15 ] = al[15 ];
br[15 ] = ar[15 ];
bl[16 ] = vaddq_s32(al[16 ], al[23 ]);
br[16 ] = vaddq_s32(ar[16 ], ar[23 ]);
bl[17 ] = vaddq_s32(al[17 ], al[22 ]);
br[17 ] = vaddq_s32(ar[17 ], ar[22 ]);
bl[18 ] = vaddq_s32(al[18 ], al[21 ]);
br[18 ] = vaddq_s32(ar[18 ], ar[21 ]);
bl[19 ] = vaddq_s32(al[19 ], al[20 ]);
br[19 ] = vaddq_s32(ar[19 ], ar[20 ]);
bl[20 ] = vsubq_s32(al[19 ], al[20 ]);
br[20 ] = vsubq_s32(ar[19 ], ar[20 ]);
bl[21 ] = vsubq_s32(al[18 ], al[21 ]);
br[21 ] = vsubq_s32(ar[18 ], ar[21 ]);
bl[22 ] = vsubq_s32(al[17 ], al[22 ]);
br[22 ] = vsubq_s32(ar[17 ], ar[22 ]);
bl[23 ] = vsubq_s32(al[16 ], al[23 ]);
br[23 ] = vsubq_s32(ar[16 ], ar[23 ]);
bl[24 ] = vsubq_s32(al[31 ], al[24 ]);
br[24 ] = vsubq_s32(ar[31 ], ar[24 ]);
bl[25 ] = vsubq_s32(al[30 ], al[25 ]);
br[25 ] = vsubq_s32(ar[30 ], ar[25 ]);
bl[26 ] = vsubq_s32(al[29 ], al[26 ]);
br[26 ] = vsubq_s32(ar[29 ], ar[26 ]);
bl[27 ] = vsubq_s32(al[28 ], al[27 ]);
br[27 ] = vsubq_s32(ar[28 ], ar[27 ]);
bl[28 ] = vaddq_s32(al[28 ], al[27 ]);
br[28 ] = vaddq_s32(ar[28 ], ar[27 ]);
bl[29 ] = vaddq_s32(al[29 ], al[26 ]);
br[29 ] = vaddq_s32(ar[29 ], ar[26 ]);
bl[30 ] = vaddq_s32(al[30 ], al[25 ]);
br[30 ] = vaddq_s32(ar[30 ], ar[25 ]);
bl[31 ] = vaddq_s32(al[31 ], al[24 ]);
br[31 ] = vaddq_s32(ar[31 ], ar[24 ]);
// Stage 4.
al[0 ] = vaddq_s32(bl[0 ], bl[3 ]);
ar[0 ] = vaddq_s32(br[0 ], br[3 ]);
al[1 ] = vaddq_s32(bl[1 ], bl[2 ]);
ar[1 ] = vaddq_s32(br[1 ], br[2 ]);
al[2 ] = vsubq_s32(bl[1 ], bl[2 ]);
ar[2 ] = vsubq_s32(br[1 ], br[2 ]);
al[3 ] = vsubq_s32(bl[0 ], bl[3 ]);
ar[3 ] = vsubq_s32(br[0 ], br[3 ]);
al[4 ] = bl[4 ];
ar[4 ] = br[4 ];
butterfly_one_coeff_s32_fast(bl[6 ], br[6 ], bl[5 ], br[5 ], cospi_16_64, &al[6 ],
&ar[6 ], &al[5 ], &ar[5 ]);
al[7 ] = bl[7 ];
ar[7 ] = br[7 ];
al[8 ] = vaddq_s32(bl[8 ], bl[11 ]);
ar[8 ] = vaddq_s32(br[8 ], br[11 ]);
al[9 ] = vaddq_s32(bl[9 ], bl[10 ]);
ar[9 ] = vaddq_s32(br[9 ], br[10 ]);
al[10 ] = vsubq_s32(bl[9 ], bl[10 ]);
ar[10 ] = vsubq_s32(br[9 ], br[10 ]);
al[11 ] = vsubq_s32(bl[8 ], bl[11 ]);
ar[11 ] = vsubq_s32(br[8 ], br[11 ]);
al[12 ] = vsubq_s32(bl[15 ], bl[12 ]);
ar[12 ] = vsubq_s32(br[15 ], br[12 ]);
al[13 ] = vsubq_s32(bl[14 ], bl[13 ]);
ar[13 ] = vsubq_s32(br[14 ], br[13 ]);
al[14 ] = vaddq_s32(bl[14 ], bl[13 ]);
ar[14 ] = vaddq_s32(br[14 ], br[13 ]);
al[15 ] = vaddq_s32(bl[15 ], bl[12 ]);
ar[15 ] = vaddq_s32(br[15 ], br[12 ]);
al[16 ] = bl[16 ];
ar[16 ] = br[16 ];
al[17 ] = bl[17 ];
ar[17 ] = br[17 ];
butterfly_two_coeff_s32(bl[29 ], br[29 ], bl[18 ], br[18 ], cospi_8_64,
cospi_24_64, &al[29 ], &ar[29 ], &al[le='color: green'>18 ], &ar[18 ]);
butterfly_two_coeff_s32(bl[28 ], br[28 ], bl[19 ], br[19 ], cospi_8_64,
cospi_24_64, &al[28 ], &ar[28 ], &al[le='color: green'>19 ], &ar[19 ]);
butterfly_two_coeff_s32(bl[27 ], br[27 ], bl[20 ], br[20 ], cospi_24_64,
-cospi_8_64, &al[27 ], &ar[27 ], &al[le='color: green'>20 ], &ar[20 ]);
butterfly_two_coeff_s32(bl[26 ], br[26 ], bl[21 ], br[21 ], cospi_24_64,
-cospi_8_64, &al[26 ], &ar[26 ], &al[le='color: green'>21 ], &ar[21 ]);
al[22 ] = bl[22 ];
ar[22 ] = br[22 ];
al[23 ] = bl[23 ];
ar[23 ] = br[23 ];
al[24 ] = bl[24 ];
ar[24 ] = br[24 ];
al[25 ] = bl[25 ];
ar[25 ] = br[25 ];
al[30 ] = bl[30 ];
ar[30 ] = br[30 ];
al[31 ] = bl[31 ];
ar[31 ] = br[31 ];
// Stage 5.
butterfly_one_coeff_s32_fast(al[0 ], ar[0 ], al[1 ], ar[1 ], cospi_16_64, &bl[0 ],
&br[0 ], &bl[1 ], &br[1 ]);
butterfly_two_coeff_s32(al[3 ], ar[3 ], al[2 ], ar[2 ], cospi_8_64, cospi_24_64,
&bl[2 ], &br[2 ], &bl[3 ], &br[3 ]);
bl[4 ] = vaddq_s32(al[4 ], al[5 ]);
br[4 ] = vaddq_s32(ar[4 ], ar[5 ]);
bl[5 ] = vsubq_s32(al[4 ], al[5 ]);
br[5 ] = vsubq_s32(ar[4 ], ar[5 ]);
bl[6 ] = vsubq_s32(al[7 ], al[6 ]);
br[6 ] = vsubq_s32(ar[7 ], ar[6 ]);
bl[7 ] = vaddq_s32(al[7 ], al[6 ]);
br[7 ] = vaddq_s32(ar[7 ], ar[6 ]);
bl[8 ] = al[8 ];
br[8 ] = ar[8 ];
butterfly_two_coeff_s32(al[14 ], ar[14 ], al[9 ], ar[9 ], cospi_8_64, cospi_24_64,
&bl[14 ], &br[14 ], &bl[9 ], &br[9 ]);
butterfly_two_coeff_s32(al[13 ], ar[13 ], al[10 ], ar[10 ], cospi_24_64,
-cospi_8_64, &bl[13 ], &br[13 ], &bl[le='color: green'>10 ], &br[10 ]);
bl[11 ] = al[11 ];
br[11 ] = ar[11 ];
bl[12 ] = al[12 ];
br[12 ] = ar[12 ];
bl[15 ] = al[15 ];
br[15 ] = ar[15 ];
bl[16 ] = vaddq_s32(al[19 ], al[16 ]);
br[16 ] = vaddq_s32(ar[19 ], ar[16 ]);
bl[17 ] = vaddq_s32(al[18 ], al[17 ]);
br[17 ] = vaddq_s32(ar[18 ], ar[17 ]);
bl[18 ] = vsubq_s32(al[17 ], al[18 ]);
br[18 ] = vsubq_s32(ar[17 ], ar[18 ]);
bl[19 ] = vsubq_s32(al[16 ], al[19 ]);
br[19 ] = vsubq_s32(ar[16 ], ar[19 ]);
bl[20 ] = vsubq_s32(al[23 ], al[20 ]);
br[20 ] = vsubq_s32(ar[23 ], ar[20 ]);
bl[21 ] = vsubq_s32(al[22 ], al[21 ]);
br[21 ] = vsubq_s32(ar[22 ], ar[21 ]);
bl[22 ] = vaddq_s32(al[21 ], al[22 ]);
br[22 ] = vaddq_s32(ar[21 ], ar[22 ]);
bl[23 ] = vaddq_s32(al[20 ], al[23 ]);
br[23 ] = vaddq_s32(ar[20 ], ar[23 ]);
bl[24 ] = vaddq_s32(al[27 ], al[24 ]);
br[24 ] = vaddq_s32(ar[27 ], ar[24 ]);
bl[25 ] = vaddq_s32(al[26 ], al[25 ]);
br[25 ] = vaddq_s32(ar[26 ], ar[25 ]);
bl[26 ] = vsubq_s32(al[25 ], al[26 ]);
br[26 ] = vsubq_s32(ar[25 ], ar[26 ]);
bl[27 ] = vsubq_s32(al[24 ], al[27 ]);
br[27 ] = vsubq_s32(ar[24 ], ar[27 ]);
bl[28 ] = vsubq_s32(al[31 ], al[28 ]);
br[28 ] = vsubq_s32(ar[31 ], ar[28 ]);
bl[29 ] = vsubq_s32(al[30 ], al[29 ]);
br[29 ] = vsubq_s32(ar[30 ], ar[29 ]);
bl[30 ] = vaddq_s32(al[29 ], al[30 ]);
br[30 ] = vaddq_s32(ar[29 ], ar[30 ]);
bl[31 ] = vaddq_s32(al[28 ], al[31 ]);
br[31 ] = vaddq_s32(ar[28 ], ar[31 ]);
// Stage 6.
al[0 ] = bl[0 ];
ar[0 ] = br[0 ];
al[1 ] = bl[1 ];
ar[1 ] = br[1 ];
al[2 ] = bl[2 ];
ar[2 ] = br[2 ];
al[3 ] = bl[3 ];
ar[3 ] = br[3 ];
butterfly_two_coeff_s32(bl[7 ], br[7 ], bl[4 ], br[4 ], cospi_4_64, cospi_28_64,
&al[4 ], &ar[4 ], &al[7 ], &ar[7 ]);
butterfly_two_coeff_s32(bl[6 ], br[6 ], bl[5 ], br[5 ], cospi_20_64, cospi_12_64,
&al[5 ], &ar[5 ], &al[6 ], &ar[6 ]);
al[8 ] = vaddq_s32(bl[8 ], bl[9 ]);
ar[8 ] = vaddq_s32(br[8 ], br[9 ]);
al[9 ] = vsubq_s32(bl[8 ], bl[9 ]);
ar[9 ] = vsubq_s32(br[8 ], br[9 ]);
al[10 ] = vsubq_s32(bl[11 ], bl[10 ]);
ar[10 ] = vsubq_s32(br[11 ], br[10 ]);
al[11 ] = vaddq_s32(bl[11 ], bl[10 ]);
ar[11 ] = vaddq_s32(br[11 ], br[10 ]);
al[12 ] = vaddq_s32(bl[12 ], bl[13 ]);
ar[12 ] = vaddq_s32(br[12 ], br[13 ]);
al[13 ] = vsubq_s32(bl[12 ], bl[13 ]);
ar[13 ] = vsubq_s32(br[12 ], br[13 ]);
al[14 ] = vsubq_s32(bl[15 ], bl[14 ]);
ar[14 ] = vsubq_s32(br[15 ], br[14 ]);
al[15 ] = vaddq_s32(bl[15 ], bl[14 ]);
ar[15 ] = vaddq_s32(br[15 ], br[14 ]);
al[16 ] = bl[16 ];
ar[16 ] = br[16 ];
al[19 ] = bl[19 ];
ar[19 ] = br[19 ];
al[20 ] = bl[20 ];
ar[20 ] = br[20 ];
al[23 ] = bl[23 ];
ar[23 ] = br[23 ];
al[24 ] = bl[24 ];
ar[24 ] = br[24 ];
al[27 ] = bl[27 ];
ar[27 ] = br[27 ];
al[28 ] = bl[28 ];
ar[28 ] = br[28 ];
al[31 ] = bl[31 ];
ar[31 ] = br[31 ];
butterfly_two_coeff_s32(bl[30 ], br[30 ], bl[17 ], br[17 ], cospi_4_64,
cospi_28_64, &al[30 ], &ar[30 ], &al[le='color: green'>17 ], &ar[17 ]);
butterfly_two_coeff_s32(bl[29 ], br[29 ], bl[18 ], br[18 ], cospi_28_64,
-cospi_4_64, &al[29 ], &ar[29 ], &al[le='color: green'>18 ], &ar[18 ]);
butterfly_two_coeff_s32(bl[26 ], br[26 ], bl[21 ], br[21 ], cospi_20_64,
cospi_12_64, &al[26 ], &ar[26 ], &al[le='color: green'>21 ], &ar[21 ]);
butterfly_two_coeff_s32(bl[25 ], br[25 ], bl[22 ], br[22 ], cospi_12_64,
-cospi_20_64, &al[25 ], &ar[25 ], &al[yle='color: green'>22 ], &ar[22 ]);
// Stage 7.
bl[0 ] = al[0 ];
br[0 ] = ar[0 ];
bl[1 ] = al[1 ];
br[1 ] = ar[1 ];
bl[2 ] = al[2 ];
br[2 ] = ar[2 ];
bl[3 ] = al[3 ];
br[3 ] = ar[3 ];
bl[4 ] = al[4 ];
br[4 ] = ar[4 ];
bl[5 ] = al[5 ];
br[5 ] = ar[5 ];
bl[6 ] = al[6 ];
br[6 ] = ar[6 ];
bl[7 ] = al[7 ];
br[7 ] = ar[7 ];
butterfly_two_coeff_s32(al[15 ], ar[15 ], al[8 ], ar[8 ], cospi_2_64, cospi_30_64,
&bl[8 ], &br[8 ], &bl[15 ], &br[15 ]);
butterfly_two_coeff_s32(al[14 ], ar[14 ], al[9 ], ar[9 ], cospi_18_64,
cospi_14_64, &bl[9 ], &br[9 ], &bl[='color: green'>14 ], &br[14 ]);
butterfly_two_coeff_s32(al[13 ], ar[13 ], al[10 ], ar[10 ], cospi_10_64,
cospi_22_64, &bl[10 ], &br[10 ], &bl[le='color: green'>13 ], &br[13 ]);
butterfly_two_coeff_s32(al[12 ], ar[12 ], al[11 ], ar[11 ], cospi_26_64,
cospi_6_64, &bl[11 ], &br[11 ], &bl[e='color: green'>12 ], &br[12 ]);
bl[16 ] = vaddq_s32(al[16 ], al[17 ]);
br[16 ] = vaddq_s32(ar[16 ], ar[17 ]);
bl[17 ] = vsubq_s32(al[16 ], al[17 ]);
br[17 ] = vsubq_s32(ar[16 ], ar[17 ]);
bl[18 ] = vsubq_s32(al[19 ], al[18 ]);
br[18 ] = vsubq_s32(ar[19 ], ar[18 ]);
bl[19 ] = vaddq_s32(al[19 ], al[18 ]);
br[19 ] = vaddq_s32(ar[19 ], ar[18 ]);
bl[20 ] = vaddq_s32(al[20 ], al[21 ]);
br[20 ] = vaddq_s32(ar[20 ], ar[21 ]);
bl[21 ] = vsubq_s32(al[20 ], al[21 ]);
br[21 ] = vsubq_s32(ar[20 ], ar[21 ]);
bl[22 ] = vsubq_s32(al[23 ], al[22 ]);
br[22 ] = vsubq_s32(ar[23 ], ar[22 ]);
bl[23 ] = vaddq_s32(al[23 ], al[22 ]);
br[23 ] = vaddq_s32(ar[23 ], ar[22 ]);
bl[24 ] = vaddq_s32(al[24 ], al[25 ]);
br[24 ] = vaddq_s32(ar[24 ], ar[25 ]);
bl[25 ] = vsubq_s32(al[24 ], al[25 ]);
br[25 ] = vsubq_s32(ar[24 ], ar[25 ]);
bl[26 ] = vsubq_s32(al[27 ], al[26 ]);
br[26 ] = vsubq_s32(ar[27 ], ar[26 ]);
bl[27 ] = vaddq_s32(al[27 ], al[26 ]);
br[27 ] = vaddq_s32(ar[27 ], ar[26 ]);
bl[28 ] = vaddq_s32(al[28 ], al[29 ]);
br[28 ] = vaddq_s32(ar[28 ], ar[29 ]);
bl[29 ] = vsubq_s32(al[28 ], al[29 ]);
br[29 ] = vsubq_s32(ar[28 ], ar[29 ]);
bl[30 ] = vsubq_s32(al[31 ], al[30 ]);
br[30 ] = vsubq_s32(ar[31 ], ar[30 ]);
bl[31 ] = vaddq_s32(al[31 ], al[30 ]);
br[31 ] = vaddq_s32(ar[31 ], ar[30 ]);
// Final stage.
left[0 ] = bl[0 ];
right[0 ] = br[0 ];
left[16 ] = bl[1 ];
right[16 ] = br[1 ];
left[8 ] = bl[2 ];
right[8 ] = br[2 ];
left[24 ] = bl[3 ];
right[24 ] = br[3 ];
left[4 ] = bl[4 ];
right[4 ] = br[4 ];
left[20 ] = bl[5 ];
right[20 ] = br[5 ];
left[12 ] = bl[6 ];
right[12 ] = br[6 ];
left[28 ] = bl[7 ];
right[28 ] = br[7 ];
left[2 ] = bl[8 ];
right[2 ] = br[8 ];
left[18 ] = bl[9 ];
right[18 ] = br[9 ];
left[10 ] = bl[10 ];
right[10 ] = br[10 ];
left[26 ] = bl[11 ];
right[26 ] = br[11 ];
left[6 ] = bl[12 ];
right[6 ] = br[12 ];
left[22 ] = bl[13 ];
right[22 ] = br[13 ];
left[14 ] = bl[14 ];
right[14 ] = br[14 ];
left[30 ] = bl[15 ];
right[30 ] = br[15 ];
butterfly_two_coeff_s32(bl[31 ], br[31 ], bl[16 ], br[16 ], cospi_1_64,
cospi_31_64, &al[1 ], &ar[1 ], &al[31 ], &ar[31 ]);
left[1 ] = al[1 ];
right[1 ] = ar[1 ];
left[31 ] = al[31 ];
right[31 ] = ar[31 ];
butterfly_two_coeff_s32(bl[30 ], br[30 ], bl[17 ], br[17 ], cospi_17_64,
cospi_15_64, &al[17 ], &ar[17 ], &al[15 ], &ar[15 ]);
left[17 ] = al[17 ];
right[17 ] = ar[17 ];
left[15 ] = al[15 ];
right[15 ] = ar[15 ];
butterfly_two_coeff_s32(bl[29 ], br[29 ], bl[18 ], br[18 ], cospi_9_64,
cospi_23_64, &al[9 ], &ar[9 ], &al[23 ], &ar[23 ]);
left[9 ] = al[9 ];
right[9 ] = ar[9 ];
left[23 ] = al[23 ];
right[23 ] = ar[23 ];
butterfly_two_coeff_s32(bl[28 ], br[28 ], bl[19 ], br[19 ], cospi_25_64,
cospi_7_64, &al[25 ], &ar[25 ], &al[7 ], &ar[7 ]);
left[25 ] = al[25 ];
right[25 ] = ar[25 ];
left[7 ] = al[7 ];
right[7 ] = ar[7 ];
butterfly_two_coeff_s32(bl[27 ], br[27 ], bl[20 ], br[20 ], cospi_5_64,
cospi_27_64, &al[5 ], &ar[5 ], &al[27 ], &ar[27 ]);
left[5 ] = al[5 ];
right[5 ] = ar[5 ];
left[27 ] = al[27 ];
right[27 ] = ar[27 ];
butterfly_two_coeff_s32(bl[26 ], br[26 ], bl[21 ], br[21 ], cospi_21_64,
cospi_11_64, &al[21 ], &ar[21 ], &al[11 ], &ar[11 ]);
left[21 ] = al[21 ];
right[21 ] = ar[21 ];
left[11 ] = al[11 ];
right[11 ] = ar[11 ];
butterfly_two_coeff_s32(bl[25 ], br[25 ], bl[22 ], br[22 ], cospi_13_64,
cospi_19_64, &al[13 ], &ar[13 ], &al[19 ], &ar[19 ]);
left[13 ] = al[13 ];
right[13 ] = ar[13 ];
left[19 ] = al[19 ];
right[19 ] = ar[19 ];
butterfly_two_coeff_s32(bl[24 ], br[24 ], bl[23 ], br[23 ], cospi_29_64,
cospi_3_64, &al[29 ], &ar[29 ], &al[3 ], &ar[3 ]);
left[29 ] = al[29 ];
right[29 ] = ar[29 ];
left[3 ] = al[3 ];
right[3 ] = ar[3 ];
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
/*
 * Measurement V0.5 in percent: C=91 H=87 G=88
 * Processing time: 0.50 seconds
 * (preprocessed on 2026-06-07)
 * (c) Formatika GbR, Germany
 */