695 void 696 row_callback(png_structp png_ptr, png_bytep new_row, 697 png_uint_32 row_num, int pass) 698 { : : 793 case gfxIFormats::RGB_A1: 794 { 795 for (PRUint32 x=iwidth; x>0; --x) { 796 *cptr32++ = GFX_PACKED_PIXEL(line[3]?0xFF:0x00, line[0], line[1], line[2]); 797 if (line[3] == 0) 798 rowHasNoAlpha = PR_FALSE; 799 line += 4; 800 } 801 } 802 break; 803 case gfxIFormats::RGB_A8: 804 { 805 for (PRUint32 x=width; x>0; --x) { 806 *cptr32++ = GFX_PACKED_PIXEL(line[3], line[0], line[1], line[2]); 807 if (line[3] != 0xff) 808 rowHasNoAlpha = PR_FALSE; 809 line += 4; 810 } 811 }
I have a question: how is the GFX_PACKED_PIXEL macro implemented?
/* From /gfx/thebes/public/gfxColor.h */

/**
 * Fast division by 255. It has the property that
 * for all 0 <= n <= 255*255, GFX_DIVIDE_BY_255(n) == n/255
 * (truncating integer division). But it only uses two adds and
 * two shifts instead of an integer division (which is expensive
 * on many processors).
 *
 * equivalent to ((v)/255)
 */
#define GFX_DIVIDE_BY_255(v)  \
    (((((unsigned)(v)) << 8) + ((unsigned)(v)) + 255) >> 16)

/**
 * Fast premultiply macro
 *
 * equivalent to (((c)*(a))/255)
 */
#define GFX_PREMULTIPLY(c,a) GFX_DIVIDE_BY_255((c)*(a))

/**
 * Macro to pack the 4 8-bit channels (A,R,G,B)
 * into a 32-bit packed premultiplied pixel.
 *
 * The checks for 0 alpha or max alpha ensure that the
 * compiler selects the quicker calculation when alpha is constant.
 *
 * Each argument is expected to be in the range 0..255 and may be
 * evaluated more than once, so it must not have side effects.
 *
 * The whole expansion is parenthesized so the macro can be used as an
 * operand of a larger expression, and every channel is converted to
 * unsigned before shifting so that no bit is shifted into the sign bit
 * of a signed int (undefined behaviour in C).
 */
#define GFX_PACKED_PIXEL(a,r,g,b)                                        \
    (((a) == 0x00) ? 0x00000000u :                                       \
     ((a) == 0xFF) ? ((0xFFu << 24) | ((unsigned)(r) << 16) |            \
                      ((unsigned)(g) << 8) | (unsigned)(b))              \
                   : (((unsigned)(a) << 24) |                            \
                      (GFX_PREMULTIPLY(r,a) << 16) |                     \
                      (GFX_PREMULTIPLY(g,a) << 8) |                      \
                      (GFX_PREMULTIPLY(b,a))))
So decoding alpha-blended PNG images may be too slow.
If it were implemented using SSE2 (the code below is incomplete), I believe it could be 200%-300% faster than the original code.
/*
 * SSE2 sketch: premultiply the colour channels of the 16 bytes at src by
 * their alpha and store the 16 result bytes at dst.
 * Variable declarations and any enclosing loop are not shown.
 * NOTE(review): assumes 4 bytes per pixel with alpha in byte 3 of each
 * pixel (i.e. 4 pixels per iteration) -- confirm against the caller.
 */

/* Constants: rounding bias 0x80 and multiplier 0x0101 in every 16-bit lane. */
xmm0080 = _mm_set1_epi16(0x0080);
xmm0101 = _mm_set1_epi16(0x0101);
/* 0x00ff in 16-bit lanes 3 and 7 (the alpha lane of each unpacked pixel). */
xmmAlpha = _mm_set_epi32(0x00ff0000, 0x00000000, 0x00ff0000, 0x00000000);

/* Unaligned load of 16 source bytes. */
data = _mm_loadu_si128((__m128i*)src);
/* Zero-extend bytes to 16-bit lanes: low 8 bytes and high 8 bytes. */
dataLo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
dataHi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());

/* Broadcast lane 3 across lanes 0-3 and lane 7 across lanes 4-7. */
alphaLo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 3, 3, 3));
alphaHi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 3, 3, 3));
alphaLo = _mm_shufflehi_epi16 (alphaLo, _MM_SHUFFLE(3, 3, 3, 3));
alphaHi = _mm_shufflehi_epi16 (alphaHi, _MM_SHUFFLE(3, 3, 3, 3));
/* Force the multiplier in lanes 3 and 7 to 0xff so that lane is scaled
 * by 255/255 and comes out unchanged. */
alphaLo = _mm_or_si128(alphaLo, xmmAlpha);
alphaHi = _mm_or_si128(alphaHi, xmmAlpha);

/* Swap lanes 0 and 2 within each group of four lanes; lanes 1 and 3
 * stay in place. */
dataLo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 0, 1, 2));
dataLo = _mm_shufflehi_epi16 (dataLo, _MM_SHUFFLE(3, 0, 1, 2));
dataHi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 0, 1, 2));
dataHi = _mm_shufflehi_epi16 (dataHi, _MM_SHUFFLE(3, 0, 1, 2));

/* Per lane: ((c * a + 0x80) * 0x0101) >> 16, which approximates
 * c * a / 255 with rounding. */
dataLo = _mm_mullo_epi16(dataLo, alphaLo);
dataHi = _mm_mullo_epi16(dataHi, alphaHi);
dataLo = _mm_adds_epu16(dataLo, xmm0080);
dataHi = _mm_adds_epu16(dataHi, xmm0080);
dataLo = _mm_mulhi_epu16(dataLo, xmm0101);
dataHi = _mm_mulhi_epu16(dataHi, xmm0101);

/* Narrow the 16-bit lanes back to bytes (unsigned saturation) and do an
 * unaligned store of the 16 result bytes. */
data = _mm_packus_epi16 (dataLo, dataHi);
_mm_storeu_si128((__m128i*)dst, data);
No comments:
Post a Comment