/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "vis_proto.h"
/***************************************************************/
typedef int t_s32;
typedef unsigned int t_u32;
#if defined(__sparcv9)
typedef long t_s64;
typedef unsigned long t_u64;
#else
typedef long long t_s64;
typedef unsigned long long t_u64;
#endif
typedef double t_d64;
/***************************************************************/
typedef union {
t_d64 d64;
struct {
t_s32 i0;
t_s32 i1;
} i32s;
} d64_2_i32;
/***************************************************************/
#define BUFF_SIZE
256
#define A_BITS
19
#define A_MASK ((
1 << A_BITS) -
1)
/***************************************************************/
static t_u64 mask_cnst[] = {
0x8000000080000000ull
};
/***************************************************************/
#define DEF_VARS(N) \
t_d64 *py = (t_d64 *)y; \
t_d64 mask = *((t_d64 *)mask_cnst); \
t_d64 ca = (
1u <<
31) -
1; \
t_d64 da = (t_d64)a; \
t_s64 buff[N], s; \
d64_2_i32 dy
/***************************************************************/
#define MUL_U32_S64_2(i) \
dy.d64 = vis_fxnor(mask, py[i]); \
buff[
2 * (i)] = (ca - (t_d64)dy.i32s.i0) * da; \
buff[
2 * (i) +
1] = (ca - (t_d64)dy.i32s.i1) * da
#define MUL_U32_S64_2_D(i) \
dy.d64 = vis_fxnor(mask, py[i]); \
d0 = ca - (t_d64)dy.i32s.i0; \
d1 = ca - (t_d64)dy.i32s.i1; \
buff[
4 * (i)] = (t_s64)(d0 * da); \
buff[
4 * (i) +
1] = (t_s64)(d0 * db); \
buff[
4 * (i) +
2] = (t_s64)(d1 * da); \
buff[
4 * (i) +
3] = (t_s64)(d1 * db)
/***************************************************************/
#define ADD_S64_U32(i) \
s = buff[i] + x[i] + c; \
z[i] = s; \
c = (s >>
32)
#define ADD_S64_U32_D(i) \
s = buff[
2 * (i)] + (((t_s64)(buff[
2 * (i) +
1])) << A_BITS) + x[i] + uc; \
z[i] = s; \
uc = ((t_u64)s >>
32)
/***************************************************************/
#define MUL_U32_S64_8(i) \
MUL_U32_S64_2(i); \
MUL_U32_S64_2(i +
1); \
MUL_U32_S64_2(i +
2); \
MUL_U32_S64_2(i +
3)
#define MUL_U32_S64_D_8(i) \
MUL_U32_S64_2_D(i); \
MUL_U32_S64_2_D(i +
1); \
MUL_U32_S64_2_D(i +
2); \
MUL_U32_S64_2_D(i +
3)
/***************************************************************/
#define ADD_S64_U32_8(i) \
ADD_S64_U32(i); \
ADD_S64_U32(i +
1); \
ADD_S64_U32(i +
2); \
ADD_S64_U32(i +
3); \
ADD_S64_U32(i +
4); \
ADD_S64_U32(i +
5); \
ADD_S64_U32(i +
6); \
ADD_S64_U32(i +
7)
#define ADD_S64_U32_D_8(i) \
ADD_S64_U32_D(i); \
ADD_S64_U32_D(i +
1); \
ADD_S64_U32_D(i +
2); \
ADD_S64_U32_D(i +
3); \
ADD_S64_U32_D(i +
4); \
ADD_S64_U32_D(i +
5); \
ADD_S64_U32_D(i +
6); \
ADD_S64_U32_D(i +
7)
/***************************************************************/
t_u32
mul_add(t_u32 *z, t_u32 *x, t_u32 *y,
int n, t_u32 a)
{
if (a < (
1 << A_BITS)) {
if (n ==
8) {
DEF_VARS(
8);
t_s32 c =
0;
MUL_U32_S64_8(
0);
ADD_S64_U32_8(
0);
return c;
}
else if (n ==
16) {
DEF_VARS(
16);
t_s32 c =
0;
MUL_U32_S64_8(
0);
MUL_U32_S64_8(
4);
ADD_S64_U32_8(
0);
ADD_S64_U32_8(
8);
return c;
}
else {
DEF_VARS(BUFF_SIZE);
t_s32 i, c =
0;
#pragma pipeloop(
0)
for (i =
0; i < (n +
1) /
2; i++) {
MUL_U32_S64_2(i);
}
#pragma pipeloop(
0)
for (i =
0; i < n; i++) {
ADD_S64_U32(i);
}
return c;
}
}
else {
if (n ==
8) {
DEF_VARS(
2 *
8);
t_d64 d0, d1, db;
t_u32 uc =
0;
da = (t_d64)(a & A_MASK);
db = (t_d64)(a >> A_BITS);
MUL_U32_S64_D_8(
0);
ADD_S64_U32_D_8(
0);
return uc;
}
else if (n ==
16) {
DEF_VARS(
2 *
16);
t_d64 d0, d1, db;
t_u32 uc =
0;
da = (t_d64)(a & A_MASK);
db = (t_d64)(a >> A_BITS);
MUL_U32_S64_D_8(
0);
MUL_U32_S64_D_8(
4);
ADD_S64_U32_D_8(
0);
ADD_S64_U32_D_8(
8);
return uc;
}
else {
DEF_VARS(
2 * BUFF_SIZE);
t_d64 d0, d1, db;
t_u32 i, uc =
0;
da = (t_d64)(a & A_MASK);
db = (t_d64)(a >> A_BITS);
#pragma pipeloop(
0)
for (i =
0; i < (n +
1) /
2; i++) {
MUL_U32_S64_2_D(i);
}
#pragma pipeloop(
0)
for (i =
0; i < n; i++) {
ADD_S64_U32_D(i);
}
return uc;
}
}
}
/***************************************************************/
t_u32
mul_add_inp(t_u32 *x, t_u32 *y,
int n, t_u32 a)
{
return mul_add(x, x, y, n, a);
}
/***************************************************************/