A big performance improvement for SSE2 delta (32-bit RGB only). Should be useful on processors without SSSE3, such as many AMD processors.

This commit is contained in:
Kfir Itzhak 2012-03-22 08:41:17 +02:00
parent 0932f50d80
commit 9dbf00ebd7
1 changed file with 12 additions and 68 deletions

View File

@ -3047,9 +3047,6 @@ __attribute__((noinline,__target__("sse2"))) void sse2_delta8_rgba(const uint8_t
"mov $0xff, %%eax\n\t" "mov $0xff, %%eax\n\t"
"movd %%eax, %%xmm0\n\t" "movd %%eax, %%xmm0\n\t"
"pshufd $0x0, %%xmm0, %%xmm0\n\t" "pshufd $0x0, %%xmm0, %%xmm0\n\t"
"mov $0x80000000, %%eax\n\t"
"movd %%eax, %%xmm5\n\t"
"pshufd $0x0, %%xmm5, %%xmm5\n\t"
"sub $0x10, %0\n\t" "sub $0x10, %0\n\t"
"sub $0x10, %1\n\t" "sub $0x10, %1\n\t"
"sub $0x4, %2\n\t" "sub $0x4, %2\n\t"
@ -3079,20 +3076,9 @@ __attribute__((noinline,__target__("sse2"))) void sse2_delta8_rgba(const uint8_t
"psrld $0x10, %%xmm2\n\t" "psrld $0x10, %%xmm2\n\t"
"pand %%xmm0, %%xmm2\n\t" "pand %%xmm0, %%xmm2\n\t"
"paddd %%xmm2, %%xmm1\n\t" "paddd %%xmm2, %%xmm1\n\t"
"movdqa %%xmm1, %%xmm3\n\t" "packssdw %%xmm1, %%xmm1\n\t"
"movdqa %%xmm1, %%xmm2\n\t" "packuswb %%xmm1, %%xmm1\n\t"
"pand %%xmm0, %%xmm3\n\t" "movd %%xmm1, %%eax\n\t"
"pand %%xmm0, %%xmm2\n\t"
"psrldq $0x9, %%xmm2\n\t"
"por %%xmm2, %%xmm3\n\t"
"movdqa %%xmm1, %%xmm2\n\t"
"pand %%xmm0, %%xmm2\n\t"
"psrldq $0x6, %%xmm2\n\t"
"por %%xmm2, %%xmm3\n\t"
"pand %%xmm0, %%xmm1\n\t"
"psrldq $0x3, %%xmm1\n\t"
"por %%xmm1, %%xmm3\n\t"
"movd %%xmm3, %%eax\n\t"
"movnti %%eax, (%2,%3)\n\t" "movnti %%eax, (%2,%3)\n\t"
"sub $0x4, %3\n\t" "sub $0x4, %3\n\t"
"jnz sse2_delta8_rgba_iter\n\t" "jnz sse2_delta8_rgba_iter\n\t"
@ -3116,9 +3102,6 @@ __attribute__((noinline,__target__("sse2"))) void sse2_delta8_bgra(const uint8_t
"mov $0xff, %%eax\n\t" "mov $0xff, %%eax\n\t"
"movd %%eax, %%xmm0\n\t" "movd %%eax, %%xmm0\n\t"
"pshufd $0x0, %%xmm0, %%xmm0\n\t" "pshufd $0x0, %%xmm0, %%xmm0\n\t"
"mov $0x80000000, %%eax\n\t"
"movd %%eax, %%xmm5\n\t"
"pshufd $0x0, %%xmm5, %%xmm5\n\t"
"sub $0x10, %0\n\t" "sub $0x10, %0\n\t"
"sub $0x10, %1\n\t" "sub $0x10, %1\n\t"
"sub $0x4, %2\n\t" "sub $0x4, %2\n\t"
@ -3148,20 +3131,9 @@ __attribute__((noinline,__target__("sse2"))) void sse2_delta8_bgra(const uint8_t
"pand %%xmm0, %%xmm2\n\t" "pand %%xmm0, %%xmm2\n\t"
"paddd %%xmm2, %%xmm2\n\t" "paddd %%xmm2, %%xmm2\n\t"
"paddd %%xmm2, %%xmm1\n\t" "paddd %%xmm2, %%xmm1\n\t"
"movdqa %%xmm1, %%xmm3\n\t" "packssdw %%xmm1, %%xmm1\n\t"
"movdqa %%xmm1, %%xmm2\n\t" "packuswb %%xmm1, %%xmm1\n\t"
"pand %%xmm0, %%xmm3\n\t" "movd %%xmm1, %%eax\n\t"
"pand %%xmm0, %%xmm2\n\t"
"psrldq $0x9, %%xmm2\n\t"
"por %%xmm2, %%xmm3\n\t"
"movdqa %%xmm1, %%xmm2\n\t"
"pand %%xmm0, %%xmm2\n\t"
"psrldq $0x6, %%xmm2\n\t"
"por %%xmm2, %%xmm3\n\t"
"pand %%xmm0, %%xmm1\n\t"
"psrldq $0x3, %%xmm1\n\t"
"por %%xmm1, %%xmm3\n\t"
"movd %%xmm3, %%eax\n\t"
"movnti %%eax, (%2,%3)\n\t" "movnti %%eax, (%2,%3)\n\t"
"sub $0x4, %3\n\t" "sub $0x4, %3\n\t"
"jnz sse2_delta8_bgra_iter\n\t" "jnz sse2_delta8_bgra_iter\n\t"
@ -3185,9 +3157,6 @@ __attribute__((noinline,__target__("sse2"))) void sse2_delta8_argb(const uint8_t
"mov $0xff, %%eax\n\t" "mov $0xff, %%eax\n\t"
"movd %%eax, %%xmm0\n\t" "movd %%eax, %%xmm0\n\t"
"pshufd $0x0, %%xmm0, %%xmm0\n\t" "pshufd $0x0, %%xmm0, %%xmm0\n\t"
"mov $0x80000000, %%eax\n\t"
"movd %%eax, %%xmm5\n\t"
"pshufd $0x0, %%xmm5, %%xmm5\n\t"
"sub $0x10, %0\n\t" "sub $0x10, %0\n\t"
"sub $0x10, %1\n\t" "sub $0x10, %1\n\t"
"sub $0x4, %2\n\t" "sub $0x4, %2\n\t"
@ -3218,20 +3187,9 @@ __attribute__((noinline,__target__("sse2"))) void sse2_delta8_argb(const uint8_t
"psrld $0x18, %%xmm2\n\t" "psrld $0x18, %%xmm2\n\t"
"pand %%xmm0, %%xmm2\n\t" "pand %%xmm0, %%xmm2\n\t"
"paddd %%xmm2, %%xmm1\n\t" "paddd %%xmm2, %%xmm1\n\t"
"movdqa %%xmm1, %%xmm3\n\t" "packssdw %%xmm1, %%xmm1\n\t"
"movdqa %%xmm1, %%xmm2\n\t" "packuswb %%xmm1, %%xmm1\n\t"
"pand %%xmm0, %%xmm3\n\t" "movd %%xmm1, %%eax\n\t"
"pand %%xmm0, %%xmm2\n\t"
"psrldq $0x9, %%xmm2\n\t"
"por %%xmm2, %%xmm3\n\t"
"movdqa %%xmm1, %%xmm2\n\t"
"pand %%xmm0, %%xmm2\n\t"
"psrldq $0x6, %%xmm2\n\t"
"por %%xmm2, %%xmm3\n\t"
"pand %%xmm0, %%xmm1\n\t"
"psrldq $0x3, %%xmm1\n\t"
"por %%xmm1, %%xmm3\n\t"
"movd %%xmm3, %%eax\n\t"
"movnti %%eax, (%2,%3)\n\t" "movnti %%eax, (%2,%3)\n\t"
"sub $0x4, %3\n\t" "sub $0x4, %3\n\t"
"jnz sse2_delta8_argb_iter\n\t" "jnz sse2_delta8_argb_iter\n\t"
@ -3255,9 +3213,6 @@ __attribute__((noinline,__target__("sse2"))) void sse2_delta8_abgr(const uint8_t
"mov $0xff, %%eax\n\t" "mov $0xff, %%eax\n\t"
"movd %%eax, %%xmm0\n\t" "movd %%eax, %%xmm0\n\t"
"pshufd $0x0, %%xmm0, %%xmm0\n\t" "pshufd $0x0, %%xmm0, %%xmm0\n\t"
"mov $0x80000000, %%eax\n\t"
"movd %%eax, %%xmm5\n\t"
"pshufd $0x0, %%xmm5, %%xmm5\n\t"
"sub $0x10, %0\n\t" "sub $0x10, %0\n\t"
"sub $0x10, %1\n\t" "sub $0x10, %1\n\t"
"sub $0x4, %2\n\t" "sub $0x4, %2\n\t"
@ -3288,20 +3243,9 @@ __attribute__((noinline,__target__("sse2"))) void sse2_delta8_abgr(const uint8_t
"pand %%xmm0, %%xmm2\n\t" "pand %%xmm0, %%xmm2\n\t"
"paddd %%xmm2, %%xmm2\n\t" "paddd %%xmm2, %%xmm2\n\t"
"paddd %%xmm2, %%xmm1\n\t" "paddd %%xmm2, %%xmm1\n\t"
"movdqa %%xmm1, %%xmm3\n\t" "packssdw %%xmm1, %%xmm1\n\t"
"movdqa %%xmm1, %%xmm2\n\t" "packuswb %%xmm1, %%xmm1\n\t"
"pand %%xmm0, %%xmm3\n\t" "movd %%xmm1, %%eax\n\t"
"pand %%xmm0, %%xmm2\n\t"
"psrldq $0x9, %%xmm2\n\t"
"por %%xmm2, %%xmm3\n\t"
"movdqa %%xmm1, %%xmm2\n\t"
"pand %%xmm0, %%xmm2\n\t"
"psrldq $0x6, %%xmm2\n\t"
"por %%xmm2, %%xmm3\n\t"
"pand %%xmm0, %%xmm1\n\t"
"psrldq $0x3, %%xmm1\n\t"
"por %%xmm1, %%xmm3\n\t"
"movd %%xmm3, %%eax\n\t"
"movnti %%eax, (%2,%3)\n\t" "movnti %%eax, (%2,%3)\n\t"
"sub $0x4, %3\n\t" "sub $0x4, %3\n\t"
"jnz sse2_delta8_abgr_iter\n\t" "jnz sse2_delta8_abgr_iter\n\t"