// SkBitmapProcState optimized Shader, Sample, or Matrix procs. // // Only S32_alpha_D32_filter_DX exploits instructions beyond // our common baseline SSE2/NEON instruction sets, so that's // all that lives here. // // The rest are scattershot at the moment but I want to get them // all migrated to be normal code inside SkBitmapProcState.cpp.
// interpolate_in_x() is the crux of the SSSE3 implementation, // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16(). auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
uint32_t B0, uint32_t B1,
__m128i interlaced_x_weights) { // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp. // // It takes two arguments interlaced byte-wise: // - first arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...] // - second arg: [ w,W, ... 7 more pairs of signed 8-bit values ...] // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ]. // // That's why we go to all this trouble to make interlaced_x_weights, // and here we're about to interlace A0 with A1 and B0 with B1 to match. // // Our interlaced_x_weights are all in [0,16], and so we need not worry about // the signedness of that input nor about the signedness of the output.
// Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B. // Returns two pixels, with each color channel in a 16-bit lane of the __m128i. auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
uint32_t A2, uint32_t A3,
uint32_t B0, uint32_t B1,
uint32_t B2, uint32_t B3,
__m128i interlaced_x_weights, int wy) { // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights.
__m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
// Interpolate in Y. As in the SSE2 code, we calculate top*(16-wy) + bot*wy // as 16*top + (bot-top)*wy to save a multiply.
__m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
_mm_mullo_epi16(_mm_sub_epi16(bot, top),
_mm_set1_epi16(wy)));
// Scale down by total max weight 16x16 = 256.
px = _mm_srli_epi16(px, 8);
// Scale by alpha if needed. if (s.fAlphaScale < 256) {
px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
} return px;
};
    // We're in _DX mode here, so we're only varying in X.
    // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
    // All the other entries in xy will be pairs of X coordinates and the X weight.
    int y0, y1, wy;
    decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

    // NOTE(review): the "count >= 4" loop header, the row0/row1 source-row
    // pointers, the x0[]/x1[] coordinate arrays, and the packed x-weights `wx`
    // used below all come from lines outside this excerpt — confirm against the
    // full file before editing.

    // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
    // and sixteen minus that as wl for pixels on the left at x0.
    __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
            wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

    // We need to interlace wl and wr for _mm_maddubs_epi16().
    __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
            interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);

    enum { A,B,C,D };

    // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
    // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
    __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
                                        row1[x0[A]], row1[x1[A]],
                                        row0[x0[B]], row0[x1[B]],
                                        row1[x0[B]], row1[x1[B]],
                                        interlaced_x_weights_AB, wy);

    // Once more with the other half of the x-weights for two more pixels C,D.
    __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
                                        row1[x0[C]], row1[x1[C]],
                                        row0[x0[D]], row0[x1[D]],
                                        row1[x0[D]], row1[x1[D]],
                                        interlaced_x_weights_CD, wy);

    // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
    _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));
    xy     += 4;
    colors += 4;
    count  -= 4;
}
// NOTE(review): this region appears to be TWO separate one-pixel tail loops
// fused by extraction — the start of the SSSE3 tail (through the wr/wl setup)
// and the SSE2 tail loop that follows.  The loads of the four source pixels
// (tl/tr/bl/br) and the multiply producing sum_in_x are not visible in this
// excerpt; confirm against the full file before editing.
while (count --> 0) {
    // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
    int x0, x1, wx;
    decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

    // As above, splat out wx four times as wr, and sixteen minus that as wl.
    __m128i wr = _mm_set1_epi8(wx),   // This splats it out 16 times, but that's fine.
            wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

    // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
    // and another in the upper 4 16-bit lanes to line up with 16 - wy.
    const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16( wy),      // Bottom pixel goes here.
                                            _mm_set1_epi16(16-wy));   // Top pixel goes here.

    while (count --> 0) {
        int x0, x1, wx;
        decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

        // We want to calculate a sum of 4 pixels weighted in two directions:
        //
        //  sum = tl * (16-wy) * (16-wx)
        //      + bl * (   wy) * (16-wx)
        //      + tr * (16-wy) * (   wx)
        //      + br * (   wy) * (   wx)
        //
        // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
        //
        // We've already prepared allY as a vector containing [wy, 16-wy] as a way
        // to apply those y-direction weights.  So we'll start on the x-direction
        // first, grouping into left and right halves, lined up with allY:
        //
        //     L = [bl, tl]
        //     R = [br, tr]
        //
        //     sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
        //
        // Rewriting that one more step, we can replace a multiply with a shift:
        //
        //     sum = horizontalSum( allY * (16*L + (R-L)*wx) )
        //
        // That's how we'll actually do this math.
        __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
                R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());

        // sum = horizontalSum( ... )
        // NOTE(review): sum_in_x must be allY * (16*L + (R-L)*wx) from a line
        // missing in this excerpt.
        __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));

        // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
        sum = _mm_srli_epi16(sum, 8);

        if (s.fAlphaScale < 256) {
            // Scale by alpha, which is in [0,256].
            sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
            sum = _mm_srli_epi16(sum, 8);
        }

        // Pack back into 8-bit values and store.
        *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
    }
}
// NOTE(review): LASX (LoongArch 256-bit SIMD) variant of the one-pixel tail
// loop.  The enclosing function header, the loads of tl/tr/bl/br, and the
// multiply producing sum_in_x are outside this excerpt — confirm before editing.

// We'll put one pixel in the low 16 16-bit lanes to line up with wy,
// and another in the upper 16 16-bit lanes to line up with 16 - wy.
__m256i allY = __lasx_xvilvl_d(__lasx_xvreplgr2vr_h(16-wy), __lasx_xvreplgr2vr_h(wy));

while (count --> 0) {
    int x0, x1, wx;
    decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

    // Load the 4 pixels we're interpolating, in this grid:
    //    | tl  tr |
    //    | bl  br |

    // We want  sum = tl*(16-wy)*(16-wx) + bl*wy*(16-wx)
    //              + tr*(16-wy)*(   wx) + br*wy*(   wx).
    // With allY = [wy, 16-wy] prepared above, group into halves lined up with it:
    //    L = [bl, tl],  R = [br, tr]
    // so sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
    //        = horizontalSum( allY * (16*L + (R-L)*wx) ),   // multiply -> shift
    // which is what the code below computes.
    __m256i L = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tl, bl)),
            R = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tr, br));

    // sum = horizontalSum( ... )
    // NOTE(review): sum_in_x must be allY * (16*L + (R-L)*wx) from a line
    // missing in this excerpt.
    __m256i sum = __lasx_xvadd_h(sum_in_x, __lasx_xvbsrl_v(sum_in_x, 8));

    // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
    sum = __lasx_xvsrli_h(sum, 8);

    if (s.fAlphaScale < 256) {   // Scale by alpha, which is in [0,256].
        sum = __lasx_xvmul_h(sum, __lasx_xvreplgr2vr_h(s.fAlphaScale));
        sum = __lasx_xvsrli_h(sum, 8);
    }

    // Pack back into 8-bit values and store.
    *colors++ = __lasx_xvpickve2gr_w(__lasx_xvpickev_b(__lasx_xvldi(0),
                                                       __lasx_xvsat_hu(sum, 8)), 0);
}
}
// NOTE(review): LSX (LoongArch 128-bit SIMD) variant of the one-pixel tail
// loop.  The enclosing function header, the loads of tl/tr/bl/br, and the
// multiply producing sum_in_x are outside this excerpt — confirm before editing.

// We'll put one pixel in the low 8 16-bit lanes to line up with wy,
// and another in the upper 8 16-bit lanes to line up with 16 - wy.
__m128i allY = __lsx_vilvl_d(__lsx_vreplgr2vr_h(16-wy), __lsx_vreplgr2vr_h(wy));

while (count --> 0) {
    int x0, x1, wx;
    decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

    // We want  sum = tl*(16-wy)*(16-wx) + bl*wy*(16-wx)
    //              + tr*(16-wy)*(   wx) + br*wy*(   wx).
    // With allY = [wy, 16-wy] prepared above, group into halves lined up with it:
    //    L = [bl, tl],  R = [br, tr]
    // so sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
    //        = horizontalSum( allY * (16*L + (R-L)*wx) ),   // multiply -> shift
    // which is what the code below computes.
    __m128i L = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tl, bl)),
            R = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tr, br));

    // sum = horizontalSum( ... )
    // NOTE(review): sum_in_x must be allY * (16*L + (R-L)*wx) from a line
    // missing in this excerpt.
    __m128i sum = __lsx_vadd_h(sum_in_x, __lsx_vbsrl_v(sum_in_x, 8));

    // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
    sum = __lsx_vsrli_h(sum, 8);

    if (s.fAlphaScale < 256) {   // Scale by alpha, which is in [0,256].
        sum = __lsx_vmul_h(sum, __lsx_vreplgr2vr_h(s.fAlphaScale));
        sum = __lsx_vsrli_h(sum, 8);
    }

    // Pack back into 8-bit values and store.
    *colors++ = __lsx_vpickve2gr_w(__lsx_vpickev_b(__lsx_vldi(0),
                                                   __lsx_vsat_hu(sum, 8)), 0);
}
}
#else
// The NEON code only actually differs from the portable code in the
// filtering step after we've loaded all four pixels we want to bilerp.
//
// NOTE(review): the declarations of vy, vconst16_8, v16_y, va0, va1, vx,
// vconst16_16, and v16_x — and the enclosing function — are outside this
// excerpt; these lines only assign them.  a00/a10/a01/a11 are presumably the
// four bilerp source pixels (tl/bl/tr/br) — confirm against the full file.

// Build the y-direction weights: vy = y splatted per byte, v16_y = 16 - y.
vy          = vdup_n_u8(y);                // duplicate y into vy
vconst16_8  = vmov_n_u8(16);               // set up constant in vconst16_8
v16_y       = vsub_u8(vconst16_8, vy);     // v16_y = 16-y

// Pair the source pixels: lane 0 holds the bottom pixel, lane 1 the top.
va0 = vdup_n_u32(a00);                     // duplicate a00
va1 = vdup_n_u32(a10);                     // duplicate a10
va0 = vset_lane_u32(a01, va0, 1);          // set top to a01
va1 = vset_lane_u32(a11, va1, 1);          // set top to a11

// Build the x-direction weights: vx = x splatted per 16-bit lane, v16_x = 16 - x.
vx          = vdup_n_u16(x);               // duplicate x into vx
vconst16_16 = vmov_n_u16(16);              // set up constant in vconst16_16
v16_x       = vsub_u16(vconst16_16, vx);   // v16_x = 16-x
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.