// This file is included in multiple translation units with different #defines set enabling
// different instruction use for different CPU architectures.
//
// A pair of files controls what #defines are defined: SkOpts_SetTarget.h sets the flags, and
// SkOpts_RestoreTarget.h restores them. SkOpts_SetTarget is controlled by setting the
// SK_OPTS_TARGET define before including it.
//
// SkOpts_SetTarget also sets the #define SK_OPTS_NS to the unique namespace for this code.
// SI is the prefix used for the small helper functions in this file: they are declared
// `static inline`, and on Clang/GCC the always_inline attribute is added as well.
#if defined(__clang__) || defined(__GNUC__)
#define SI __attribute__((always_inline)) static inline
#else
#define SI static inline
#endif
// Returns 255 / a, or 0 when a compares equal to zero (so no division by zero is performed).
// The NEON build forwards to this helper and relies on this expression compiling to
// branchless code there, so the single conditional expression is kept as written.
SI float reciprocal_alpha_times_255_portable(float a) {
return a != 0 ? 255.0f / a : 0.0f;
}
// Returns 1 / a, or 0 when a compares equal to zero (so no division by zero is performed).
// The NEON build forwards to this helper and relies on this expression compiling to
// branchless code there, so the single conditional expression is kept as written.
SI float reciprocal_alpha_portable(float a) {
return a != 0 ? 1.0f / a : 0.0f;
}
#if defined(SK_ARM_HAS_NEON) // -- NEON -- Harden against timing attacks // For neon, the portable versions create branchless code.
// NEON variant: forwards to reciprocal_alpha_times_255_portable (255 / a, or 0 for a == 0).
// The enclosing #if notes that the portable code is branchless on NEON.
SI float reciprocal_alpha_times_255(float a) {
return reciprocal_alpha_times_255_portable(a);
}
// NEON variant: forwards to reciprocal_alpha_portable (1 / a, or 0 for a == 0).
// The enclosing #if notes that the portable code is branchless on NEON.
SI float reciprocal_alpha(float a) {
return reciprocal_alpha_portable(a);
}
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1 && (defined(__clang__) || !defined(_MSC_VER)) // -- SSE -- Harden against timing attacks -- MSVC is not supported.
// F4 is used below as a four-lane float vector with element-wise `/` and `!=` operators.
using F4 = __m128;
// SSE variant of reciprocal_alpha_times_255: yields 255 / a, or 0 when a == 0, without any
// branch on `a` in the source (this section is the one hardened against timing attacks).
// The division is executed even when a == 0, which is why the float-divide-by-zero
// sanitizer annotation is attached; the a == 0 case is then masked to zero.
// SkASSERT checks the expected input range 0 <= a <= 255.
SK_NO_SANITIZE("float-divide-by-zero")
SI float reciprocal_alpha_times_255(float a) {
SkASSERT(0 <= a && a <= 255);
// Broadcast a to all four lanes.
F4 vA{a, a, a, a};
// F4{255.0f} only sets lane 0 to 255 (the other lanes are zero-initialized); that is
// sufficient because only lane 0 of the result is read back.
auto q = F4{255.0f} / vA;
// The lane-wise comparison acts as a bit mask: ANDing it with q keeps the quotient where
// a != 0 and produces 0.0f where a == 0. Lane 0 is the return value.
return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
}
// SSE variant of reciprocal_alpha: yields 1 / a, or 0 when a == 0, without any branch on `a`
// in the source (this section is the one hardened against timing attacks).
// The division is executed even when a == 0, which is why the float-divide-by-zero
// sanitizer annotation is attached; the a == 0 case is then masked to zero.
// SkASSERT checks the expected input range 0 <= a <= 1.
SK_NO_SANITIZE("float-divide-by-zero")
SI float reciprocal_alpha(float a) {
SkASSERT(0 <= a && a <= 1);
// Broadcast a to all four lanes.
F4 vA{a, a, a, a};
// F4{1.0f} only sets lane 0 to 1 (the other lanes are zero-initialized); that is
// sufficient because only lane 0 of the result is read back.
auto q = F4{1.0f} / vA;
// The lane-wise comparison acts as a bit mask: ANDing it with q keeps the quotient where
// a != 0 and produces 0.0f where a == 0. Lane 0 is the return value.
return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
}
#else // -- Portable -- *Not* hardened against timing attacks
// Fallback variant: forwards to reciprocal_alpha_times_255_portable (255 / a, or 0 for a == 0).
// As the enclosing #else states, this path is *not* hardened against timing attacks.
SI float reciprocal_alpha_times_255(float a) {
return reciprocal_alpha_times_255_portable(a);
}
// Fallback variant: forwards to reciprocal_alpha_portable (1 / a, or 0 for a == 0).
// As the enclosing #else states, this path is *not* hardened against timing attacks.
SI float reciprocal_alpha(float a) {
return reciprocal_alpha_portable(a);
}
#endif
// Premultiplies `count` 32-bit pixels from src into dst, keeping the channel order.
// Each pixel holds alpha in bits 24-31 and three color channels in bits 0-23; every color
// channel c is replaced by the rounded product (c * alpha + 127) / 255, alpha is copied.
static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
    // Rounded multiply of one 8-bit channel by an 8-bit alpha.
    const auto mulAlpha = [](uint32_t channel, uint32_t alpha) -> uint32_t {
        return (channel * alpha + 127) / 255;
    };

    for (int px = 0; px < count; ++px) {
        const uint32_t in    = src[px];
        const uint32_t alpha = in >> 24;

        const uint32_t hi  = mulAlpha((in >> 16) & 0xFF, alpha);
        const uint32_t mid = mulAlpha((in >>  8) & 0xFF, alpha);
        const uint32_t lo  = mulAlpha( in        & 0xFF, alpha);

        dst[px] = (alpha << 24) | (hi << 16) | (mid << 8) | lo;
    }
}
// RP uses the following rounding routines in store_8888. There are three different
// styles of rounding:
//   1) +0.5 and floor - used by scalar and ARMv7
//   2) round to even for sure - ARMv8
//   3) round to even maybe - intel. The rounding on intel depends on MXCSR which
//                            defaults to round to even.
//
// Note that vrndns_f32 is the single float version of vcvtnq_u32_f32.
// Doing the math for an original color b resulting in a premul color x, // x = ⌊(b * a + 127) / 255⌋, // x ≤ (b * a + 127) / 255 < x + 1, // 255 * x ≤ b * a + 127 < 255 * (x + 1), // 255 * x - 127 ≤ b * a < 255 * (x + 1) - 127, // 255 * x - 127 ≤ b * a < 255 * x + 128, // (255 * x - 127) / a ≤ b < (255 * x + 128) / a. // So, given a premul value x < a, the original color b can be in the above range. // We can pick the middle of that range as // b = 255 * x / a // b = x * (255 / a)
// Fast un-premultiply of a single channel: scales c by reciprocalA (the caller passes
// 255 / alpha), adds 0.5 so that the truncating conversion rounds to nearest, and clamps
// the result to at most 255.
SI uint32_t unpremul_quick(float reciprocalA, float c) {
    const float rounded = c * reciprocalA + 0.5f;
    return static_cast<uint32_t>(std::min(255.0f, rounded));
}
// Similar to unpremul_quick, but simulates Raster Pipeline: the channel is first normalized
// to the interval [0, 1], multiplied by reciprocalA (1 / normalized alpha), scaled back to
// [0, 255] and clamped, and the final rounding is delegated to pixel_round_as_RP
// (round-to-even in most cases instead of round-up).
SI uint32_t unpremul_simulating_RP(float reciprocalA, float c) {
    const float unitC  = c * (1.0f / 255.0f);
    const float scaled = unitC * reciprocalA * 255.0f;
    return pixel_round_as_RP(std::min(255.0f, scaled));
}
// Un-premultiplies three color channels by alpha and packs them with alpha into one pixel.
//   c00, c08, c16  premultiplied channel values; the result of each is placed at bit
//                  offset 0, 8 and 16 respectively.
//   a              alpha in [0, 255]; stored unchanged in bits 24-31.
// kFastUnpremul selects at compile time between the quick rounding path (reciprocal
// pre-scaled by 255) and the path that simulates Raster Pipeline (alpha normalized to [0, 1]).
SI uint32_t rgbA_to_CCCA(float c00, float c08, float c16, float a) {
    const uint32_t alphaBits = static_cast<uint32_t>(a) << 24;

    if constexpr (kFastUnpremul) {
        const float invA = reciprocal_alpha_times_255(a);
        return alphaBits
             | (unpremul_quick(invA, c16) << 16)
             | (unpremul_quick(invA, c08) <<  8)
             | (unpremul_quick(invA, c00) <<  0);
    } else {
        const float unitA = a * (1.0f / 255.0f);
        const float invA  = reciprocal_alpha(unitA);
        return alphaBits
             | (unpremul_simulating_RP(invA, c16) << 16)
             | (unpremul_simulating_RP(invA, c08) <<  8)
             | (unpremul_simulating_RP(invA, c00) <<  0);
    }
}
// Un-premultiplies `count` 32-bit pixels from src into dst, keeping the channel order.
// Each pixel is split into its four bytes (alpha in bits 24-31) and handed to rgbA_to_CCCA,
// with the byte from bits 0-7 going back to bits 0-7 and the byte from bits 16-23 going
// back to bits 16-23.
static void rgbA_to_RGBA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int px = 0; px < count; ++px) {
        const uint32_t in = src[px];

        const float alpha = static_cast<float>( in >> 24);
        const float blue  = static_cast<float>((in >> 16) & 0xFF);
        const float green = static_cast<float>((in >>  8) & 0xFF);
        const float red   = static_cast<float>( in        & 0xFF);

        dst[px] = rgbA_to_CCCA(red, green, blue, alpha);
    }
}
// Un-premultiplies `count` 32-bit pixels from src into dst while swapping the channels held
// in bits 0-7 and bits 16-23. Each pixel is split into its four bytes (alpha in bits 24-31)
// and handed to rgbA_to_CCCA with the two outer color channels exchanged.
static void rgbA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int px = 0; px < count; ++px) {
        const uint32_t in = src[px];

        const uint32_t alpha =  in >> 24;
        const uint32_t blue  = (in >> 16) & 0xFF;
        const uint32_t green = (in >>  8) & 0xFF;
        const uint32_t red   =  in        & 0xFF;

        // The 8-bit values convert exactly to the float parameters of rgbA_to_CCCA.
        dst[px] = rgbA_to_CCCA(static_cast<float>(blue),
                               static_cast<float>(green),
                               static_cast<float>(red),
                               static_cast<float>(alpha));
    }
}
// Premultiplies `count` 32-bit pixels from src into dst and swaps the channels held in
// bits 0-7 and bits 16-23. Every color channel c becomes (c * alpha + 127) / 255; alpha
// (bits 24-31) is copied unchanged.
static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int px = 0; px < count; ++px) {
        const uint32_t in    = src[px];
        const uint32_t alpha = in >> 24;

        // Rounded premultiply of each channel, named by its source bit position.
        const uint32_t from16 = (((in >> 16) & 0xFF) * alpha + 127) / 255;
        const uint32_t from08 = (((in >>  8) & 0xFF) * alpha + 127) / 255;
        const uint32_t from00 = (( in        & 0xFF) * alpha + 127) / 255;

        // The outer two channels trade places; the middle one stays put.
        dst[px] = (alpha << 24) | (from00 << 16) | (from08 << 8) | from16;
    }
}
// Copies `count` 32-bit pixels from src to dst while exchanging the bytes held in bits 0-7
// and bits 16-23. The bytes in bits 8-15 and 24-31 are left where they are.
static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int px = 0; px < count; ++px) {
        const uint32_t in = src[px];

        const uint32_t kept    = in & 0xFF00FF00u;          // bits 8-15 and 24-31
        const uint32_t lowToHi = (in & 0x000000FFu) << 16;  // bits 0-7  -> bits 16-23
        const uint32_t hiToLow = (in >> 16) & 0x000000FFu;  // bits 16-23 -> bits 0-7

        dst[px] = kept | lowToHi | hiToLow;
    }
}
// Expands `count` two-byte (gray, alpha) pixels from src into 32-bit pixels in dst.
// The gray byte is replicated into bits 0-7, 8-15 and 16-23; alpha goes to bits 24-31.
static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int px = 0; px < count; ++px, src += 2) {
        const uint32_t gray  = src[0];
        const uint32_t alpha = src[1];

        // Multiplying by 0x010101 copies the byte into the three low byte positions.
        dst[px] = (alpha << 24) | (gray * 0x010101u);
    }
}
// Expands `count` two-byte (gray, alpha) pixels from src into premultiplied 32-bit pixels.
// The gray value is first scaled by alpha with rounding, (gray * alpha + 127) / 255, then
// replicated into bits 0-7, 8-15 and 16-23; alpha goes to bits 24-31.
static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int px = 0; px < count; ++px, src += 2) {
        const uint32_t alpha  = src[1];
        const uint32_t premul = (src[0] * alpha + 127) / 255;

        // Multiplying by 0x010101 copies the byte into the three low byte positions.
        dst[px] = (alpha << 24) | (premul * 0x010101u);
    }
}
// Converts `count` inverted-CMYK pixels (c in bits 0-7, m in 8-15, y in 16-23, k in 24-31)
// into opaque 32-bit pixels: each of c, m, y is multiplied by k with rounding,
// (v * k + 127) / 255, kept at its original bit position, and bits 24-31 are set to 0xFF.
// See comments in SkSwizzler.cpp for details on the conversion formula.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    const auto mulK = [](uint32_t v, uint32_t k) -> uint32_t { return (v * k + 127) / 255; };

    for (int px = 0; px < count; ++px) {
        const uint32_t in = src[px];
        const uint32_t k  = in >> 24;

        const uint32_t blue  = mulK((in >> 16) & 0xFF, k);  // from y
        const uint32_t green = mulK((in >>  8) & 0xFF, k);  // from m
        const uint32_t red   = mulK( in        & 0xFF, k);  // from c

        dst[px] = 0xFF000000u | (blue << 16) | (green << 8) | red;
    }
}
// Converts `count` inverted-CMYK pixels (c in bits 0-7, m in 8-15, y in 16-23, k in 24-31)
// into opaque 32-bit pixels with the outer color channels swapped: each of c, m, y is
// multiplied by k with rounding, (v * k + 127) / 255; the c product lands in bits 16-23,
// the m product in bits 8-15, the y product in bits 0-7, and bits 24-31 are set to 0xFF.
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int px = 0; px < count; ++px) {
        const uint32_t in = src[px];
        const uint32_t k  = in >> 24;

        const uint32_t fromY = (((in >> 16) & 0xFF) * k + 127) / 255;
        const uint32_t fromM = (((in >>  8) & 0xFF) * k + 127) / 255;
        const uint32_t fromC = (( in        & 0xFF) * k + 127) / 255;

        dst[px] = 0xFF000000u | (fromC << 16) | (fromM << 8) | fromY;
    }
}
#if defined(SK_ARM_HAS_NEON) // -- NEON ----------------------------------------------------------------------------------------- // Rounded divide by 255, (x + 127) / 255
// Rounded divide by 255 of eight 16-bit lanes; the eight results are returned as 8-bit lanes.
SI uint8x8_t div255_round(uint16x8_t x) {
// result = (x + 127) / 255
// result = (x + 127) / 256 + error1
//
// error1 = (x + 127) / (255 * 256)
// error1 = (x + 127) / (256 * 256) + error2
//
// error2 = (x + 127) / (255 * 256 * 256)
//
// The maximum value of error2 is too small to matter. Thus:
// result = (x + 127) / 256 + (x + 127) / (256 * 256)
// result = ((x + 127) / 256 + x + 127) / 256
// result = ((x + 127) >> 8 + x + 127) >> 8
//
// Use >>> to represent "rounded right shift" which, conveniently,
// NEON supports in one instruction.
// result = ((x >>> 8) + x) >>> 8
//
// Note that the second right shift is actually performed as an
// "add, round, and narrow back to 8-bits" instruction.
return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}
// Scale a byte by another, (x * y + 127) / 255, for eight lanes at once.
// vmull_u8 forms the eight 16-bit products x * y, which div255_round then divides by 255
// with rounding and returns as 8-bit lanes.
SI uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
return div255_round(vmull_u8(x, y));
}
// Call portable code to finish up the tail of [0,8) pixels.
auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
proc(dst, src, count);
}
static void expand_grayA(bool kPremul, uint32_t dst[], const uint8_t* src, int count) { while (count >= 16) { // Load 16 pixels.
uint8x16x2_t ga = vld2q_u8(src);
// Premultiply if requested. if (kPremul) {
ga.val[0] = vcombine_u8(
scale(vget_low_u8(ga.val[0]), vget_low_u8(ga.val[1])),
scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
}
// Set each of the color channels.
uint8x16x4_t rgba;
rgba.val[0] = ga.val[0];
rgba.val[1] = ga.val[0];
rgba.val[2] = ga.val[0];
rgba.val[3] = ga.val[1];
if (count >= 8) { // Load 8 pixels.
uint8x8x2_t ga = vld2_u8(src);
// Premultiply if requested. if (kPremul) {
ga.val[0] = scale(ga.val[0], ga.val[1]);
}
// Set each of the color channels.
uint8x8x4_t rgba;
rgba.val[0] = ga.val[0];
rgba.val[1] = ga.val[0];
rgba.val[2] = ga.val[0];
rgba.val[3] = ga.val[1];
// Only use the SIMD code if simulating RP, otherwise the quick code auto-vectorizes will // enough on ARM to not need a SIMD implementation. if constexpr (!kFastUnpremul) { while (count >= 8) { const uint8x8x4_t in = vld4_u8((const uint8_t*)src);
// Scale a byte by another, lane-wise: returns (x * y + 127) / 255 for every 16-bit lane.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
static __m256i scale(__m256i x, __m256i y) {
    // (p+127)/255 == ((p+128)*257)>>16 for 0 <= p <= 255*255.
    const __m256i product = _mm256_mullo_epi16(x, y);
    const __m256i biased  = _mm256_add_epi16(product, _mm256_set1_epi16(128));
    // Taking the high 16 bits of the unsigned multiply by 257 performs the ">> 16".
    return _mm256_mulhi_epu16(biased, _mm256_set1_epi16(257));
}
if (count >= 8) {
__m256i lo = _mm256_loadu_si256((const __m256i*) src),
hi = _mm256_setzero_si256();
premul8(&lo, &hi);
_mm256_storeu_si256((__m256i*) dst, lo);
src += 8;
dst += 8;
count -= 8;
}
// Call portable code to finish up the tail of [0,8) pixels.
auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
proc(dst, src, count);
}
// Scale a byte by another, lane-wise: returns (x * y + 127) / 255 for every 16-bit lane.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
static __m128i scale(__m128i x, __m128i y) {
    // (p+127)/255 == ((p+128)*257)>>16 for 0 <= p <= 255*255.
    const __m128i product = _mm_mullo_epi16(x, y);
    const __m128i biased  = _mm_add_epi16(product, _mm_set1_epi16(128));
    // Taking the high 16 bits of the unsigned multiply by 257 performs the ">> 16".
    return _mm_mulhi_epu16(biased, _mm_set1_epi16(257));
}
if (count >= 4) {
__m128i lo = _mm_loadu_si128((const __m128i*) src),
hi = _mm_setzero_si128();
premul8(&lo, &hi);
_mm_storeu_si128((__m128i*) dst, lo);
src += 4;
dst += 4;
count -= 4;
}
// Call portable code to finish up the tail of [0,4) pixels.
auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
proc(dst, src, count);
}
// Scale a byte by another. // Inputs are stored in 16-bit lanes, but are not larger than 8-bits. // (x+127)/255 == ((x+128)*257)>>16
SI __m256i scale(__m256i x, __m256i y) { const __m256i _128 = __lasx_xvreplgr2vr_h(128); const __m256i _257 = __lasx_xvreplgr2vr_h(257);
if (count >= 8) {
__m256i lo = __lasx_xvld(src, 0),
hi = __lasx_xvldi(0);
premul8(&lo, &hi);
__lasx_xvst(lo, dst, 0);
src += 8;
dst += 8;
count -= 8;
}
// Call portable code to finish up the tail of [0,4) pixels.
auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
proc(dst, src, count);
}
// Scale a byte by another. // Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
SI __m128i scale(__m128i x, __m128i y) { const __m128i _128 = __lsx_vreplgr2vr_h(128); const __m128i _257 = __lsx_vreplgr2vr_h(257);
// Unpack to 16-bit planar.
__m128i r = __lsx_vilvl_b(zeros, rg), // r_r_r_r_ R_R_R_R_
g = __lsx_vilvh_b(zeros, rg), // g_g_g_g_ G_G_G_G_
b = __lsx_vilvl_b(zeros, ba), // b_b_b_b_ B_B_B_B_
a = __lsx_vilvh_b(zeros, ba); // a_a_a_a_ A_A_A_A_
// Premultiply!
r = scale(r, a);
g = scale(g, a);
b = scale(b, a);
// Repack into interlaced pixels.
rg = __lsx_vor_v(r, __lsx_vslli_h(g, 8)); // rgrgrgrg RGRGRGRG
ba = __lsx_vor_v(b, __lsx_vslli_h(a, 8)); // babababa BABABABA
*lo = __lsx_vilvl_h(ba, rg); // rgbargba rgbargba
*hi = __lsx_vilvh_h(ba, rg); // RGBARGBA RGBARGBA
}; while (count >= 8) {
__m128i lo = __lsx_vld(src ,0),
hi = __lsx_vld(src ,16);
premul8(&lo, &hi);
__lsx_vst(lo, dst, 0);
__lsx_vst(hi, dst, 16);
src += 8;
dst += 8;
count -= 8;
}
if (count >= 4) {
__m128i lo = __lsx_vld(src, 0),
hi = __lsx_vldi(0);
premul8(&lo, &hi);
__lsx_vst(lo, dst, 0);
src += 4;
dst += 4;
count -= 4;
}
// Call portable code to finish up the tail of [0,4) pixels.
auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
proc(dst, src, count);
}
// Call portable code to finish up the tail of [0,8) pixels.
auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
proc(dst, src, count);
}
// Entry point for this SIMD branch: forwards to insert_alpha_should_swaprb with
// kSwapRB = false, i.e. the channel order is kept.
// NOTE(review): presumably src holds `count` packed 3-byte pixels and dst receives `count`
// 32-bit pixels with opaque alpha - confirm against insert_alpha_should_swaprb.
void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
insert_alpha_should_swaprb(false, dst, src, count);
}
// Entry point for this SIMD branch: forwards to insert_alpha_should_swaprb with
// kSwapRB = true, i.e. the first and third channel of each pixel are exchanged.
// NOTE(review): presumably src holds `count` packed 3-byte pixels and dst receives `count`
// 32-bit pixels with opaque alpha - confirm against insert_alpha_should_swaprb.
void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
insert_alpha_should_swaprb(true, dst, src, count);
}
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
static void insert_alpha_should_swaprb(bool kSwapRB,
uint32_t dst[], const uint8_t* src, int count) { const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
__m128i expand; const uint8_t X = 0xFF; // Used a placeholder. The value of X is irrelevant. if (kSwapRB) {
expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
} else {
expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
}
while (count >= 6) { // Load a vector. While this actually contains 5 pixels plus an // extra component, we will discard all but the first four pixels on // this iteration.
__m128i rgb = _mm_loadu_si128((const __m128i*) src);
// Expand the first four pixels to RGBX and then mask to RGB(FF).
__m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);
// Store 4 pixels.
_mm_storeu_si128((__m128i*) dst, rgba);
src += 4*3;
dst += 4;
count -= 4;
}
// Call portable code to finish up the tail of [0,4) pixels.
auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
proc(dst, src, count);
}
while (count >= 8) { // Load a vector. While this actually contains 5 pixels plus an // extra component, we will discard all but the first four pixels on // this iteration.
__m256i rgb = __lasx_xvld(src, 0);
__m256i rgb_l = __lasx_xvpermi_d(rgb, 0x44);
__m256i rgb_h = __lasx_xvpermi_d(rgb, 0xEE);
// Expand the first four pixels to RGBX and then mask to RGB(FF).
__m256i rgba = __lasx_xvor_v(__lasx_xvshuf_b(rgb_h, rgb_l, expand), alphaMask);
// Store 8 pixels.
__lasx_xvst(rgba, dst, 0);
src += 4*6;
dst += 8;
count -= 8;
}
// Call portable code to finish up the tail of [0,8) pixels.
auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
proc(dst, src, count);
}
// Entry point for this SIMD branch: forwards to insert_alpha_should_swaprb with
// kSwapRB = false, i.e. the channel order is kept.
/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
insert_alpha_should_swaprb(false, dst, src, count);
}
// Entry point for this SIMD branch: forwards to insert_alpha_should_swaprb with
// kSwapRB = true, i.e. the first and third channel of each pixel are exchanged.
/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
insert_alpha_should_swaprb(true, dst, src, count);
}
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
// LSX variant: expands packed 3-byte pixels from src into 32-bit pixels in dst whose top
// byte is forced to 0xFF (alphaMask), four pixels per SIMD iteration, then hands the
// remaining pixels to RGB_to_BGR1_portable / RGB_to_RGB1_portable depending on kSwapRB.
// NOTE(review): the shuffle control `expand` used below is not declared anywhere in the
// visible text of this function - its kSwapRB-dependent setup appears to be missing here
// and must be restored before this compiles.
static void insert_alpha_should_swaprb(bool kSwapRB,
uint32_t dst[], const uint8_t* src, int count) { const __m128i alphaMask = __lsx_vreplgr2vr_w(0xFF000000);
// The 16-byte load reads past the four pixels being converted, so the loop only runs while
// at least six pixels (18 bytes) remain in the source.
while (count >= 6) {
// Load a vector. While this actually contains 5 pixels plus an
// extra component, we will discard all but the first four pixels on
// this iteration.
__m128i rgb = __lsx_vld(src, 0);
// Expand the first four pixels to RGBX and then mask to RGB(FF).
__m128i rgba = __lsx_vor_v(__lsx_vshuf_b(rgb, rgb, expand), alphaMask);
// Store 4 pixels.
__lsx_vst(rgba, dst, 0);
src += 4*3;
dst += 4;
count -= 4;
}
// Call portable code to finish up the tail of [0,6) pixels.
auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
proc(dst, src, count);
}
// LSX entry point: forwards to insert_alpha_should_swaprb with kSwapRB = false, i.e. the
// channel order is kept.
/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
insert_alpha_should_swaprb(false, dst, src, count);
}
// LSX entry point: forwards to insert_alpha_should_swaprb with kSwapRB = true, i.e. the
// first and third channel of each pixel are exchanged.
/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
insert_alpha_should_swaprb(true, dst, src, count);
}
#else
// Fallback when none of the SIMD branches of this #if chain is selected: forwards directly
// to RGB_to_RGB1_portable.
void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
RGB_to_RGB1_portable(dst, src, count);
}
// Fallback when none of the SIMD branches of this #if chain is selected: forwards directly
// to RGB_to_BGR1_portable.
void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
RGB_to_BGR1_portable(dst, src, count);
}
#endif
} // namespace SK_OPTS_NS
#undef SI
Messung V0.5 in Prozent
¤ Dauer der Verarbeitung: 0.37 Sekunden
(vorverarbeitet am 2026-04-28)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.