SSSE3 rgba->gray8 convert function now uses fewer instructions and is 10-20% faster

This commit is contained in:
Kfir Itzhak 2017-03-14 16:54:00 +02:00
parent 844b4bb5ee
commit 20604c2150
1 changed file with 12 additions and 25 deletions

View File

@ -45,7 +45,6 @@ static short *r_v_table;
static short *g_v_table; static short *g_v_table;
static short *g_u_table; static short *g_u_table;
static short *b_u_table; static short *b_u_table;
__attribute__((aligned(16))) static const uint8_t movemask[16] = {0,4,8,12,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
jpeg_compress_struct *Image::writejpg_ccinfo[101] = { 0 }; jpeg_compress_struct *Image::writejpg_ccinfo[101] = { 0 };
jpeg_compress_struct *Image::encodejpg_ccinfo[101] = { 0 }; jpeg_compress_struct *Image::encodejpg_ccinfo[101] = { 0 };
@ -4180,38 +4179,26 @@ void ssse3_convert_rgba_gray8(const uint8_t* col1, uint8_t* result, unsigned lon
"mov $0x1F1F1F1F, %%eax\n\t" "mov $0x1F1F1F1F, %%eax\n\t"
"movd %%eax, %%xmm4\n\t" "movd %%eax, %%xmm4\n\t"
"pshufd $0x0, %%xmm4, %%xmm4\n\t" "pshufd $0x0, %%xmm4, %%xmm4\n\t"
"mov $0xff, %%eax\n\t" "mov $0x00010502, %%eax\n\t"
"movd %%eax, %%xmm0\n\t" "movd %%eax, %%xmm3\n\t"
"pshufd $0x0, %%xmm0, %%xmm0\n\t" "pshufd $0x0, %%xmm3, %%xmm3\n\t"
"movdqa %3, %%xmm5\n\t" "pxor %%xmm0, %%xmm0\n\t"
"sub $0x10, %0\n\t" "sub $0x10, %0\n\t"
"sub $0x4, %1\n\t" "sub $0x4, %1\n\t"
"ssse3_convert_rgba_gray8_iter:\n\t" "ssse3_convert_rgba_gray8_iter:\n\t"
"movdqa (%0,%2,4), %%xmm3\n\t" "movdqa (%0,%2,4), %%xmm1\n\t"
"psrlq $0x3, %%xmm3\n\t" "psrlq $0x3, %%xmm1\n\t"
"pand %%xmm4, %%xmm3\n\t" "pand %%xmm4, %%xmm1\n\t"
"movdqa %%xmm3, %%xmm2\n\t" "pmaddubsw %%xmm3, %%xmm1\n\t"
"psrld $0x8, %%xmm2\n\t" "phaddw %%xmm0, %%xmm1\n\t"
"pand %%xmm0, %%xmm2\n\t" "packuswb %%xmm1, %%xmm1\n\t"
"movdqa %%xmm2, %%xmm1\n\t"
"pslld $0x2, %%xmm2\n\t"
"paddd %%xmm1, %%xmm2\n\t"
"movdqa %%xmm3, %%xmm1\n\t"
"pand %%xmm0, %%xmm1\n\t"
"paddd %%xmm1, %%xmm1\n\t"
"paddd %%xmm2, %%xmm1\n\t"
"movdqa %%xmm3, %%xmm2\n\t"
"psrld $0x10, %%xmm2\n\t"
"pand %%xmm0, %%xmm2\n\t"
"paddd %%xmm2, %%xmm1\n\t"
"pshufb %%xmm5, %%xmm1\n\t"
"movd %%xmm1, %%eax\n\t" "movd %%xmm1, %%eax\n\t"
"movnti %%eax, (%1,%2)\n\t" "movnti %%eax, (%1,%2)\n\t"
"sub $0x4, %2\n\t" "sub $0x4, %2\n\t"
"jnz ssse3_convert_rgba_gray8_iter\n\t" "jnz ssse3_convert_rgba_gray8_iter\n\t"
: :
: "r" (col1), "r" (result), "r" (count), "m" (*movemask) : "r" (col1), "r" (result), "r" (count)
: "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "cc", "memory" : "%eax", "%xmm0", "%xmm1", "%xmm3", "%xmm4", "cc", "memory"
); );
#else #else
Panic("SSE function called on a non x86\\x86-64 platform"); Panic("SSE function called on a non x86\\x86-64 platform");