Removed the SSSE3 deinterlacing functions, as they were typically no faster, and often slower, than the standard code (when compiled with -O2 or better).
The deinterlacing function is too complicated to be vectorized efficiently.
This commit is contained in:
parent
e7a681b8ff
commit
67f7ad40ae
885
src/zm_image.cpp
885
src/zm_image.cpp
|
@ -267,22 +267,17 @@ void Image::Initialise()
|
||||||
Debug(4,"Delta: CPU extensions disabled, using standard delta functions");
|
Debug(4,"Delta: CPU extensions disabled, using standard delta functions");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Use SSSE3 deinterlace functions? */
|
/*
|
||||||
if(config.cpu_extensions && sseversion >= 35) {
|
SSSE3 deinterlacing functions were removed because they were usually equal
|
||||||
fptr_deinterlace_4field_rgba = &ssse3_deinterlace_4field_rgba;
|
or slower than the standard code (compiled with -O2 or better)
|
||||||
fptr_deinterlace_4field_bgra = &ssse3_deinterlace_4field_bgra;
|
The function is too complicated to be vectorized efficiently
|
||||||
fptr_deinterlace_4field_argb = &ssse3_deinterlace_4field_argb;
|
*/
|
||||||
fptr_deinterlace_4field_abgr = &ssse3_deinterlace_4field_abgr;
|
|
||||||
fptr_deinterlace_4field_gray8 = &ssse3_deinterlace_4field_gray8;
|
|
||||||
Debug(4,"Deinterlace: Using SSSE3 delta functions");
|
|
||||||
} else {
|
|
||||||
fptr_deinterlace_4field_rgba = &std_deinterlace_4field_rgba;
|
fptr_deinterlace_4field_rgba = &std_deinterlace_4field_rgba;
|
||||||
fptr_deinterlace_4field_bgra = &std_deinterlace_4field_bgra;
|
fptr_deinterlace_4field_bgra = &std_deinterlace_4field_bgra;
|
||||||
fptr_deinterlace_4field_argb = &std_deinterlace_4field_argb;
|
fptr_deinterlace_4field_argb = &std_deinterlace_4field_argb;
|
||||||
fptr_deinterlace_4field_abgr = &std_deinterlace_4field_abgr;
|
fptr_deinterlace_4field_abgr = &std_deinterlace_4field_abgr;
|
||||||
fptr_deinterlace_4field_gray8 = &std_deinterlace_4field_gray8;
|
fptr_deinterlace_4field_gray8 = &std_deinterlace_4field_gray8;
|
||||||
Debug(4,"Deinterlace: Using standard delta functions");
|
Debug(4,"Deinterlace: Using standard functions");
|
||||||
}
|
|
||||||
|
|
||||||
/* Use SSE2 aligned memory copy? */
|
/* Use SSE2 aligned memory copy? */
|
||||||
if(config.cpu_extensions && sseversion >= 20) {
|
if(config.cpu_extensions && sseversion >= 20) {
|
||||||
|
@ -4954,871 +4949,3 @@ __attribute__((noinline)) void std_deinterlace_4field_abgr(uint8_t* col1, uint8_
|
||||||
pncurrent += 4;
|
pncurrent += 4;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Grayscale SSSE3 */
|
|
||||||
#if defined(__i386__) || defined(__x86_64__)
|
|
||||||
__attribute__((noinline,__target__("ssse3")))
|
|
||||||
#endif
|
|
||||||
void ssse3_deinterlace_4field_gray8(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height) {
|
|
||||||
|
|
||||||
#if ((defined(__i386__) || defined(__x86_64__) || defined(ZM_KEEP_SSE)) && !defined(ZM_STRIP_SSE))
|
|
||||||
union {
|
|
||||||
uint32_t int32;
|
|
||||||
uint8_t int8a[4];
|
|
||||||
} threshold_mask;
|
|
||||||
threshold_mask.int8a[0] = threshold;
|
|
||||||
threshold_mask.int8a[1] = 0;
|
|
||||||
threshold_mask.int8a[2] = threshold;
|
|
||||||
threshold_mask.int8a[3] = 0;
|
|
||||||
|
|
||||||
unsigned long row_width = width;
|
|
||||||
uint8_t* max_ptr = col1 + (row_width * (height-2));
|
|
||||||
uint8_t* max_ptr2 = col1 + row_width;
|
|
||||||
|
|
||||||
__asm__ __volatile__ (
|
|
||||||
/* Load the threshold */
|
|
||||||
"mov %5, %%eax\n\t"
|
|
||||||
"movd %%eax, %%xmm4\n\t"
|
|
||||||
"pshufd $0x0, %%xmm4, %%xmm4\n\t"
|
|
||||||
/* Zero the temporary register */
|
|
||||||
"pxor %%xmm0, %%xmm0\n\t"
|
|
||||||
|
|
||||||
"algo_ssse3_deinterlace_4field_gray8:\n\t"
|
|
||||||
|
|
||||||
/* Load pabove into xmm1 and pnabove into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */
|
|
||||||
"pmaxub %%xmm2, %%xmm1\n\t"
|
|
||||||
"pminub %%xmm5, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */
|
|
||||||
|
|
||||||
/* Next row */
|
|
||||||
"add %4, %0\n\t"
|
|
||||||
"add %4, %1\n\t"
|
|
||||||
|
|
||||||
/* Load pcurrent into xmm1 and pncurrent into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */
|
|
||||||
"pmaxub %%xmm2, %%xmm1\n\t"
|
|
||||||
"pminub %%xmm6, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
|
|
||||||
"pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together
|
|
||||||
"movdqa %%xmm1, %%xmm2\n\t"
|
|
||||||
|
|
||||||
/* Do the comparison on words instead of bytes because we don't have unsigned comparison */
|
|
||||||
"punpcklbw %%xmm0, %%xmm1\n\t" // Expand pixels 0-7 into words into xmm1
|
|
||||||
"punpckhbw %%xmm0, %%xmm2\n\t" // Expand pixels 8-15 into words into xmm2
|
|
||||||
"pcmpgtw %%xmm4, %%xmm1\n\t" // Compare average delta with threshold for pixels 0-7
|
|
||||||
"pcmpgtw %%xmm4, %%xmm2\n\t" // Compare average delta with threshold for pixels 8-15
|
|
||||||
"packsswb %%xmm2, %%xmm1\n\t" // Pack the comparison results into xmm1
|
|
||||||
|
|
||||||
"movdqa (%0,%4), %%xmm2\n\t" // Load pbelow
|
|
||||||
"pavgb %%xmm5, %%xmm2\n\t" // Average pabove and pbelow
|
|
||||||
"pand %%xmm1, %%xmm2\n\t" // Filter out pixels in avg that shouldn't be copied
|
|
||||||
"pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced
|
|
||||||
|
|
||||||
"por %%xmm2, %%xmm1\n\t" // Put the new values in pcurrent
|
|
||||||
"movntdq %%xmm1, (%0)\n\t" // Write pcurrent
|
|
||||||
|
|
||||||
"sub %4, %0\n\t" // Restore pcurrent to pabove
|
|
||||||
"sub %4, %1\n\t" // Restore pncurrent to pnabove
|
|
||||||
|
|
||||||
/* Next pixels */
|
|
||||||
"add $0x10, %0\n\t" // Add 16 to pcurrent
|
|
||||||
"add $0x10, %1\n\t" // Add 16 to pncurrent
|
|
||||||
|
|
||||||
/* Check if we reached the row end */
|
|
||||||
"cmp %2, %0\n\t"
|
|
||||||
"jb algo_ssse3_deinterlace_4field_gray8\n\t" // Go for another iteration
|
|
||||||
|
|
||||||
/* Next row */
|
|
||||||
"add %4, %0\n\t" // Add width to pcurrent
|
|
||||||
"add %4, %1\n\t" // Add width to pncurrent
|
|
||||||
"mov %0, %2\n\t"
|
|
||||||
"add %4, %2\n\t" // Add width to max_ptr2
|
|
||||||
|
|
||||||
/* Check if we reached the end */
|
|
||||||
"cmp %3, %0\n\t"
|
|
||||||
"jb algo_ssse3_deinterlace_4field_gray8\n\t" // Go for another iteration
|
|
||||||
|
|
||||||
/* Special case for the last line */
|
|
||||||
/* Load pabove into xmm1 and pnabove into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */
|
|
||||||
"pmaxub %%xmm2, %%xmm1\n\t"
|
|
||||||
"pminub %%xmm5, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */
|
|
||||||
|
|
||||||
/* Next row */
|
|
||||||
"add %4, %0\n\t"
|
|
||||||
"add %4, %1\n\t"
|
|
||||||
|
|
||||||
/* Load pcurrent into xmm1 and pncurrent into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */
|
|
||||||
"pmaxub %%xmm2, %%xmm1\n\t"
|
|
||||||
"pminub %%xmm6, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
|
|
||||||
"pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together
|
|
||||||
"movdqa %%xmm1, %%xmm2\n\t"
|
|
||||||
|
|
||||||
/* Do the comparison on words instead of bytes because we don't have unsigned comparison */
|
|
||||||
"punpcklbw %%xmm0, %%xmm1\n\t" // Expand pixels 0-7 into words into xmm1
|
|
||||||
"punpckhbw %%xmm0, %%xmm2\n\t" // Expand pixels 8-15 into words into xmm2
|
|
||||||
"pcmpgtw %%xmm4, %%xmm1\n\t" // Compare average delta with threshold for pixels 0-7
|
|
||||||
"pcmpgtw %%xmm4, %%xmm2\n\t" // Compare average delta with threshold for pixels 8-15
|
|
||||||
"packsswb %%xmm2, %%xmm1\n\t" // Pack the comparison results into xmm1
|
|
||||||
|
|
||||||
"pand %%xmm1, %%xmm5\n\t" // Filter out pixels in pabove that shouldn't be copied
|
|
||||||
"pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced
|
|
||||||
|
|
||||||
"por %%xmm5, %%xmm1\n\t" // Put the new values in pcurrent
|
|
||||||
"movntdq %%xmm1, (%0)\n\t" // Write pcurrent
|
|
||||||
:
|
|
||||||
: "r" (col1), "r" (col2), "r" (max_ptr2), "r" (max_ptr), "r" (row_width), "m" (threshold_mask.int32)
|
|
||||||
: "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "cc", "memory"
|
|
||||||
);
|
|
||||||
#else
|
|
||||||
Panic("SSE function called on a non x86\\x86-64 platform");
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/* RGBA SSSE3 */
|
|
||||||
#if defined(__i386__) || defined(__x86_64__)
|
|
||||||
__attribute__((noinline,__target__("ssse3")))
|
|
||||||
#endif
|
|
||||||
void ssse3_deinterlace_4field_rgba(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height) {
|
|
||||||
#if ((defined(__i386__) || defined(__x86_64__) || defined(ZM_KEEP_SSE)) && !defined(ZM_STRIP_SSE))
|
|
||||||
__attribute__((aligned(16))) static const uint8_t movemask2[16] = {1,1,1,1,1,0,0,2,9,9,9,9,9,8,8,10};
|
|
||||||
|
|
||||||
const uint32_t threshold_val = threshold;
|
|
||||||
|
|
||||||
unsigned long row_width = width*4;
|
|
||||||
uint8_t* max_ptr = col1 + (row_width * (height-2));
|
|
||||||
uint8_t* max_ptr2 = col1 + row_width;
|
|
||||||
|
|
||||||
__asm__ __volatile__ (
|
|
||||||
"mov $0x1F1F1F1F, %%eax\n\t"
|
|
||||||
"movd %%eax, %%xmm4\n\t"
|
|
||||||
"pshufd $0x0, %%xmm4, %%xmm4\n\t"
|
|
||||||
"movdqa %6, %%xmm3\n\t"
|
|
||||||
"mov %5, %%eax\n\t"
|
|
||||||
#if defined(__x86_64__)
|
|
||||||
"movd %%eax, %%xmm8\n\t"
|
|
||||||
"pshufd $0x0, %%xmm8, %%xmm8\n\t"
|
|
||||||
#endif
|
|
||||||
/* Zero the temporary register */
|
|
||||||
"pxor %%xmm0, %%xmm0\n\t"
|
|
||||||
|
|
||||||
"algo_ssse3_deinterlace_4field_rgba:\n\t"
|
|
||||||
|
|
||||||
/* Load pabove into xmm1 and pnabove into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */
|
|
||||||
"psrlq $0x3, %%xmm1\n\t"
|
|
||||||
"psrlq $0x3, %%xmm2\n\t"
|
|
||||||
"pand %%xmm4, %%xmm1\n\t"
|
|
||||||
"pand %%xmm4, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
"pabsb %%xmm1, %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm2, %%xmm1\n\t"
|
|
||||||
"punpckldq %%xmm1, %%xmm1\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm1\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm1\n\t"
|
|
||||||
"punpckhdq %%xmm2, %%xmm2\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm2\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm2\n\t"
|
|
||||||
"packuswb %%xmm2, %%xmm1\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */
|
|
||||||
|
|
||||||
/* Next row */
|
|
||||||
"add %4, %0\n\t"
|
|
||||||
"add %4, %1\n\t"
|
|
||||||
|
|
||||||
/* Load pcurrent into xmm1 and pncurrent into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */
|
|
||||||
"psrlq $0x3, %%xmm1\n\t"
|
|
||||||
"psrlq $0x3, %%xmm2\n\t"
|
|
||||||
"pand %%xmm4, %%xmm1\n\t"
|
|
||||||
"pand %%xmm4, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
"pabsb %%xmm1, %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm2, %%xmm1\n\t"
|
|
||||||
"punpckldq %%xmm1, %%xmm1\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm1\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm1\n\t"
|
|
||||||
"punpckhdq %%xmm2, %%xmm2\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm2\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm2\n\t"
|
|
||||||
"packuswb %%xmm2, %%xmm1\n\t"
|
|
||||||
|
|
||||||
"pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together
|
|
||||||
|
|
||||||
#if defined(__x86_64__)
|
|
||||||
"pcmpgtd %%xmm8, %%xmm1\n\t" // Compare average delta with the threshold
|
|
||||||
#else
|
|
||||||
"movd %%eax, %%xmm7\n\t" // Setup the threshold
|
|
||||||
"pshufd $0x0, %%xmm7, %%xmm7\n\t"
|
|
||||||
|
|
||||||
"pcmpgtd %%xmm7, %%xmm1\n\t" // Compare average delta with the threshold
|
|
||||||
#endif
|
|
||||||
"movdqa (%0,%4), %%xmm2\n\t" // Load pbelow
|
|
||||||
"pavgb %%xmm5, %%xmm2\n\t" // Average pabove and pbelow
|
|
||||||
"pand %%xmm1, %%xmm2\n\t" // Filter out pixels in avg that shouldn't be copied
|
|
||||||
"pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced
|
|
||||||
|
|
||||||
"por %%xmm2, %%xmm1\n\t" // Put the new values in pcurrent
|
|
||||||
"movntdq %%xmm1, (%0)\n\t" // Write pcurrent
|
|
||||||
|
|
||||||
"sub %4, %0\n\t" // Restore pcurrent to pabove
|
|
||||||
"sub %4, %1\n\t" // Restore pncurrent to pnabove
|
|
||||||
|
|
||||||
/* Next pixels */
|
|
||||||
"add $0x10, %0\n\t" // Add 16 to pcurrent
|
|
||||||
"add $0x10, %1\n\t" // Add 16 to pncurrent
|
|
||||||
|
|
||||||
/* Check if we reached the row end */
|
|
||||||
"cmp %2, %0\n\t"
|
|
||||||
"jb algo_ssse3_deinterlace_4field_rgba\n\t" // Go for another iteration
|
|
||||||
|
|
||||||
/* Next row */
|
|
||||||
"add %4, %0\n\t" // Add width to pcurrent
|
|
||||||
"add %4, %1\n\t" // Add width to pncurrent
|
|
||||||
"mov %0, %2\n\t"
|
|
||||||
"add %4, %2\n\t" // Add width to max_ptr2
|
|
||||||
|
|
||||||
/* Check if we reached the end */
|
|
||||||
"cmp %3, %0\n\t"
|
|
||||||
"jb algo_ssse3_deinterlace_4field_rgba\n\t" // Go for another iteration
|
|
||||||
|
|
||||||
/* Special case for the last line */
|
|
||||||
/* Load pabove into xmm1 and pnabove into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */
|
|
||||||
"psrlq $0x3, %%xmm1\n\t"
|
|
||||||
"psrlq $0x3, %%xmm2\n\t"
|
|
||||||
"pand %%xmm4, %%xmm1\n\t"
|
|
||||||
"pand %%xmm4, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
"pabsb %%xmm1, %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm2, %%xmm1\n\t"
|
|
||||||
"punpckldq %%xmm1, %%xmm1\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm1\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm1\n\t"
|
|
||||||
"punpckhdq %%xmm2, %%xmm2\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm2\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm2\n\t"
|
|
||||||
"packuswb %%xmm2, %%xmm1\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */
|
|
||||||
|
|
||||||
/* Next row */
|
|
||||||
"add %4, %0\n\t"
|
|
||||||
"add %4, %1\n\t"
|
|
||||||
|
|
||||||
/* Load pcurrent into xmm1 and pncurrent into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */
|
|
||||||
"psrlq $0x3, %%xmm1\n\t"
|
|
||||||
"psrlq $0x3, %%xmm2\n\t"
|
|
||||||
"pand %%xmm4, %%xmm1\n\t"
|
|
||||||
"pand %%xmm4, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
"pabsb %%xmm1, %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm2, %%xmm1\n\t"
|
|
||||||
"punpckldq %%xmm1, %%xmm1\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm1\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm1\n\t"
|
|
||||||
"punpckhdq %%xmm2, %%xmm2\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm2\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm2\n\t"
|
|
||||||
"packuswb %%xmm2, %%xmm1\n\t"
|
|
||||||
|
|
||||||
"pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together
|
|
||||||
|
|
||||||
#if defined(__x86_64__)
|
|
||||||
"pcmpgtd %%xmm8, %%xmm1\n\t" // Compare average delta with the threshold
|
|
||||||
#else
|
|
||||||
"movd %%eax, %%xmm7\n\t" // Setup the threshold
|
|
||||||
"pshufd $0x0, %%xmm7, %%xmm7\n\t"
|
|
||||||
|
|
||||||
"pcmpgtd %%xmm7, %%xmm1\n\t" // Compare average delta with the threshold
|
|
||||||
#endif
|
|
||||||
"pand %%xmm1, %%xmm5\n\t" // Filter out pixels in pabove that shouldn't be copied
|
|
||||||
"pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced
|
|
||||||
|
|
||||||
"por %%xmm5, %%xmm1\n\t" // Put the new values in pcurrent
|
|
||||||
"movntdq %%xmm1, (%0)\n\t" // Write pcurrent
|
|
||||||
:
|
|
||||||
: "r" (col1), "r" (col2), "r" (max_ptr2), "r" (max_ptr), "r" (row_width), "m" (threshold_val), "m" (*movemask2)
|
|
||||||
#if defined(__x86_64__)
|
|
||||||
: "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "cc", "memory"
|
|
||||||
#else
|
|
||||||
: "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "cc", "memory"
|
|
||||||
#endif
|
|
||||||
);
|
|
||||||
#else
|
|
||||||
Panic("SSE function called on a non x86\\x86-64 platform");
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/* BGRA SSSE3 */
|
|
||||||
#if defined(__i386__) || defined(__x86_64__)
|
|
||||||
__attribute__((noinline,__target__("ssse3")))
|
|
||||||
#endif
|
|
||||||
void ssse3_deinterlace_4field_bgra(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height) {
|
|
||||||
#if ((defined(__i386__) || defined(__x86_64__) || defined(ZM_KEEP_SSE)) && !defined(ZM_STRIP_SSE))
|
|
||||||
__attribute__((aligned(16))) static const uint8_t movemask2[16] = {1,1,1,1,1,2,2,0,9,9,9,9,9,10,10,8};
|
|
||||||
|
|
||||||
const uint32_t threshold_val = threshold;
|
|
||||||
|
|
||||||
unsigned long row_width = width*4;
|
|
||||||
uint8_t* max_ptr = col1 + (row_width * (height-2));
|
|
||||||
uint8_t* max_ptr2 = col1 + row_width;
|
|
||||||
|
|
||||||
__asm__ __volatile__ (
|
|
||||||
"mov $0x1F1F1F1F, %%eax\n\t"
|
|
||||||
"movd %%eax, %%xmm4\n\t"
|
|
||||||
"pshufd $0x0, %%xmm4, %%xmm4\n\t"
|
|
||||||
"movdqa %6, %%xmm3\n\t"
|
|
||||||
"mov %5, %%eax\n\t"
|
|
||||||
#if defined(__x86_64__)
|
|
||||||
"movd %%eax, %%xmm8\n\t"
|
|
||||||
"pshufd $0x0, %%xmm8, %%xmm8\n\t"
|
|
||||||
#endif
|
|
||||||
/* Zero the temporary register */
|
|
||||||
"pxor %%xmm0, %%xmm0\n\t"
|
|
||||||
|
|
||||||
"algo_ssse3_deinterlace_4field_bgra:\n\t"
|
|
||||||
|
|
||||||
/* Load pabove into xmm1 and pnabove into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */
|
|
||||||
"psrlq $0x3, %%xmm1\n\t"
|
|
||||||
"psrlq $0x3, %%xmm2\n\t"
|
|
||||||
"pand %%xmm4, %%xmm1\n\t"
|
|
||||||
"pand %%xmm4, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
"pabsb %%xmm1, %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm2, %%xmm1\n\t"
|
|
||||||
"punpckldq %%xmm1, %%xmm1\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm1\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm1\n\t"
|
|
||||||
"punpckhdq %%xmm2, %%xmm2\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm2\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm2\n\t"
|
|
||||||
"packuswb %%xmm2, %%xmm1\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */
|
|
||||||
|
|
||||||
/* Next row */
|
|
||||||
"add %4, %0\n\t"
|
|
||||||
"add %4, %1\n\t"
|
|
||||||
|
|
||||||
/* Load pcurrent into xmm1 and pncurrent into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */
|
|
||||||
"psrlq $0x3, %%xmm1\n\t"
|
|
||||||
"psrlq $0x3, %%xmm2\n\t"
|
|
||||||
"pand %%xmm4, %%xmm1\n\t"
|
|
||||||
"pand %%xmm4, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
"pabsb %%xmm1, %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm2, %%xmm1\n\t"
|
|
||||||
"punpckldq %%xmm1, %%xmm1\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm1\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm1\n\t"
|
|
||||||
"punpckhdq %%xmm2, %%xmm2\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm2\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm2\n\t"
|
|
||||||
"packuswb %%xmm2, %%xmm1\n\t"
|
|
||||||
|
|
||||||
"pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together
|
|
||||||
|
|
||||||
#if defined(__x86_64__)
|
|
||||||
"pcmpgtd %%xmm8, %%xmm1\n\t" // Compare average delta with the threshold
|
|
||||||
#else
|
|
||||||
"movd %%eax, %%xmm7\n\t" // Setup the threshold
|
|
||||||
"pshufd $0x0, %%xmm7, %%xmm7\n\t"
|
|
||||||
|
|
||||||
"pcmpgtd %%xmm7, %%xmm1\n\t" // Compare average delta with the threshold
|
|
||||||
#endif
|
|
||||||
"movdqa (%0,%4), %%xmm2\n\t" // Load pbelow
|
|
||||||
"pavgb %%xmm5, %%xmm2\n\t" // Average pabove and pbelow
|
|
||||||
"pand %%xmm1, %%xmm2\n\t" // Filter out pixels in avg that shouldn't be copied
|
|
||||||
"pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced
|
|
||||||
|
|
||||||
"por %%xmm2, %%xmm1\n\t" // Put the new values in pcurrent
|
|
||||||
"movntdq %%xmm1, (%0)\n\t" // Write pcurrent
|
|
||||||
|
|
||||||
"sub %4, %0\n\t" // Restore pcurrent to pabove
|
|
||||||
"sub %4, %1\n\t" // Restore pncurrent to pnabove
|
|
||||||
|
|
||||||
/* Next pixels */
|
|
||||||
"add $0x10, %0\n\t" // Add 16 to pcurrent
|
|
||||||
"add $0x10, %1\n\t" // Add 16 to pncurrent
|
|
||||||
|
|
||||||
/* Check if we reached the row end */
|
|
||||||
"cmp %2, %0\n\t"
|
|
||||||
"jb algo_ssse3_deinterlace_4field_bgra\n\t" // Go for another iteration
|
|
||||||
|
|
||||||
/* Next row */
|
|
||||||
"add %4, %0\n\t" // Add width to pcurrent
|
|
||||||
"add %4, %1\n\t" // Add width to pncurrent
|
|
||||||
"mov %0, %2\n\t"
|
|
||||||
"add %4, %2\n\t" // Add width to max_ptr2
|
|
||||||
|
|
||||||
/* Check if we reached the end */
|
|
||||||
"cmp %3, %0\n\t"
|
|
||||||
"jb algo_ssse3_deinterlace_4field_bgra\n\t" // Go for another iteration
|
|
||||||
|
|
||||||
/* Special case for the last line */
|
|
||||||
/* Load pabove into xmm1 and pnabove into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */
|
|
||||||
"psrlq $0x3, %%xmm1\n\t"
|
|
||||||
"psrlq $0x3, %%xmm2\n\t"
|
|
||||||
"pand %%xmm4, %%xmm1\n\t"
|
|
||||||
"pand %%xmm4, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
"pabsb %%xmm1, %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm2, %%xmm1\n\t"
|
|
||||||
"punpckldq %%xmm1, %%xmm1\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm1\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm1\n\t"
|
|
||||||
"punpckhdq %%xmm2, %%xmm2\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm2\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm2\n\t"
|
|
||||||
"packuswb %%xmm2, %%xmm1\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */
|
|
||||||
|
|
||||||
/* Next row */
|
|
||||||
"add %4, %0\n\t"
|
|
||||||
"add %4, %1\n\t"
|
|
||||||
|
|
||||||
/* Load pcurrent into xmm1 and pncurrent into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */
|
|
||||||
"psrlq $0x3, %%xmm1\n\t"
|
|
||||||
"psrlq $0x3, %%xmm2\n\t"
|
|
||||||
"pand %%xmm4, %%xmm1\n\t"
|
|
||||||
"pand %%xmm4, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
"pabsb %%xmm1, %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm2, %%xmm1\n\t"
|
|
||||||
"punpckldq %%xmm1, %%xmm1\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm1\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm1\n\t"
|
|
||||||
"punpckhdq %%xmm2, %%xmm2\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm2\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm2\n\t"
|
|
||||||
"packuswb %%xmm2, %%xmm1\n\t"
|
|
||||||
|
|
||||||
"pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together
|
|
||||||
|
|
||||||
#if defined(__x86_64__)
|
|
||||||
"pcmpgtd %%xmm8, %%xmm1\n\t" // Compare average delta with the threshold
|
|
||||||
#else
|
|
||||||
"movd %%eax, %%xmm7\n\t" // Setup the threshold
|
|
||||||
"pshufd $0x0, %%xmm7, %%xmm7\n\t"
|
|
||||||
|
|
||||||
"pcmpgtd %%xmm7, %%xmm1\n\t" // Compare average delta with the threshold
|
|
||||||
#endif
|
|
||||||
"pand %%xmm1, %%xmm5\n\t" // Filter out pixels in pabove that shouldn't be copied
|
|
||||||
"pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced
|
|
||||||
|
|
||||||
"por %%xmm5, %%xmm1\n\t" // Put the new values in pcurrent
|
|
||||||
"movntdq %%xmm1, (%0)\n\t" // Write pcurrent
|
|
||||||
:
|
|
||||||
: "r" (col1), "r" (col2), "r" (max_ptr2), "r" (max_ptr), "r" (row_width), "m" (threshold_val), "m" (*movemask2)
|
|
||||||
#if defined(__x86_64__)
|
|
||||||
: "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "cc", "memory"
|
|
||||||
#else
|
|
||||||
: "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "cc", "memory"
|
|
||||||
#endif
|
|
||||||
);
|
|
||||||
#else
|
|
||||||
Panic("SSE function called on a non x86\\x86-64 platform");
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ARGB SSSE3 */
|
|
||||||
#if defined(__i386__) || defined(__x86_64__)
|
|
||||||
__attribute__((noinline,__target__("ssse3")))
|
|
||||||
#endif
|
|
||||||
void ssse3_deinterlace_4field_argb(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height) {
|
|
||||||
#if ((defined(__i386__) || defined(__x86_64__) || defined(ZM_KEEP_SSE)) && !defined(ZM_STRIP_SSE))
|
|
||||||
__attribute__((aligned(16))) static const uint8_t movemask2[16] = {2,2,2,2,2,1,1,3,10,10,10,10,10,9,9,11};
|
|
||||||
|
|
||||||
const uint32_t threshold_val = threshold;
|
|
||||||
|
|
||||||
unsigned long row_width = width*4;
|
|
||||||
uint8_t* max_ptr = col1 + (row_width * (height-2));
|
|
||||||
uint8_t* max_ptr2 = col1 + row_width;
|
|
||||||
|
|
||||||
__asm__ __volatile__ (
|
|
||||||
"mov $0x1F1F1F1F, %%eax\n\t"
|
|
||||||
"movd %%eax, %%xmm4\n\t"
|
|
||||||
"pshufd $0x0, %%xmm4, %%xmm4\n\t"
|
|
||||||
"movdqa %6, %%xmm3\n\t"
|
|
||||||
"mov %5, %%eax\n\t"
|
|
||||||
#if defined(__x86_64__)
|
|
||||||
"movd %%eax, %%xmm8\n\t"
|
|
||||||
"pshufd $0x0, %%xmm8, %%xmm8\n\t"
|
|
||||||
#endif
|
|
||||||
/* Zero the temporary register */
|
|
||||||
"pxor %%xmm0, %%xmm0\n\t"
|
|
||||||
|
|
||||||
"algo_ssse3_deinterlace_4field_argb:\n\t"
|
|
||||||
|
|
||||||
/* Load pabove into xmm1 and pnabove into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */
|
|
||||||
"psrlq $0x3, %%xmm1\n\t"
|
|
||||||
"psrlq $0x3, %%xmm2\n\t"
|
|
||||||
"pand %%xmm4, %%xmm1\n\t"
|
|
||||||
"pand %%xmm4, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
"pabsb %%xmm1, %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm2, %%xmm1\n\t"
|
|
||||||
"punpckldq %%xmm1, %%xmm1\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm1\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm1\n\t"
|
|
||||||
"punpckhdq %%xmm2, %%xmm2\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm2\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm2\n\t"
|
|
||||||
"packuswb %%xmm2, %%xmm1\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */
|
|
||||||
|
|
||||||
/* Next row */
|
|
||||||
"add %4, %0\n\t"
|
|
||||||
"add %4, %1\n\t"
|
|
||||||
|
|
||||||
/* Load pcurrent into xmm1 and pncurrent into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */
|
|
||||||
"psrlq $0x3, %%xmm1\n\t"
|
|
||||||
"psrlq $0x3, %%xmm2\n\t"
|
|
||||||
"pand %%xmm4, %%xmm1\n\t"
|
|
||||||
"pand %%xmm4, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
"pabsb %%xmm1, %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm2, %%xmm1\n\t"
|
|
||||||
"punpckldq %%xmm1, %%xmm1\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm1\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm1\n\t"
|
|
||||||
"punpckhdq %%xmm2, %%xmm2\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm2\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm2\n\t"
|
|
||||||
"packuswb %%xmm2, %%xmm1\n\t"
|
|
||||||
|
|
||||||
"pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together
|
|
||||||
|
|
||||||
#if defined(__x86_64__)
|
|
||||||
"pcmpgtd %%xmm8, %%xmm1\n\t" // Compare average delta with the threshold
|
|
||||||
#else
|
|
||||||
"movd %%eax, %%xmm7\n\t" // Setup the threshold
|
|
||||||
"pshufd $0x0, %%xmm7, %%xmm7\n\t"
|
|
||||||
|
|
||||||
"pcmpgtd %%xmm7, %%xmm1\n\t" // Compare average delta with the threshold
|
|
||||||
#endif
|
|
||||||
"movdqa (%0,%4), %%xmm2\n\t" // Load pbelow
|
|
||||||
"pavgb %%xmm5, %%xmm2\n\t" // Average pabove and pbelow
|
|
||||||
"pand %%xmm1, %%xmm2\n\t" // Filter out pixels in avg that shouldn't be copied
|
|
||||||
"pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced
|
|
||||||
|
|
||||||
"por %%xmm2, %%xmm1\n\t" // Put the new values in pcurrent
|
|
||||||
"movntdq %%xmm1, (%0)\n\t" // Write pcurrent
|
|
||||||
|
|
||||||
"sub %4, %0\n\t" // Restore pcurrent to pabove
|
|
||||||
"sub %4, %1\n\t" // Restore pncurrent to pnabove
|
|
||||||
|
|
||||||
/* Next pixels */
|
|
||||||
"add $0x10, %0\n\t" // Add 16 to pcurrent
|
|
||||||
"add $0x10, %1\n\t" // Add 16 to pncurrent
|
|
||||||
|
|
||||||
/* Check if we reached the row end */
|
|
||||||
"cmp %2, %0\n\t"
|
|
||||||
"jb algo_ssse3_deinterlace_4field_argb\n\t" // Go for another iteration
|
|
||||||
|
|
||||||
/* Next row */
|
|
||||||
"add %4, %0\n\t" // Add width to pcurrent
|
|
||||||
"add %4, %1\n\t" // Add width to pncurrent
|
|
||||||
"mov %0, %2\n\t"
|
|
||||||
"add %4, %2\n\t" // Add width to max_ptr2
|
|
||||||
|
|
||||||
/* Check if we reached the end */
|
|
||||||
"cmp %3, %0\n\t"
|
|
||||||
"jb algo_ssse3_deinterlace_4field_argb\n\t" // Go for another iteration
|
|
||||||
|
|
||||||
/* Special case for the last line */
|
|
||||||
/* Load pabove into xmm1 and pnabove into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */
|
|
||||||
"psrlq $0x3, %%xmm1\n\t"
|
|
||||||
"psrlq $0x3, %%xmm2\n\t"
|
|
||||||
"pand %%xmm4, %%xmm1\n\t"
|
|
||||||
"pand %%xmm4, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
"pabsb %%xmm1, %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm2, %%xmm1\n\t"
|
|
||||||
"punpckldq %%xmm1, %%xmm1\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm1\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm1\n\t"
|
|
||||||
"punpckhdq %%xmm2, %%xmm2\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm2\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm2\n\t"
|
|
||||||
"packuswb %%xmm2, %%xmm1\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */
|
|
||||||
|
|
||||||
/* Next row */
|
|
||||||
"add %4, %0\n\t"
|
|
||||||
"add %4, %1\n\t"
|
|
||||||
|
|
||||||
/* Load pcurrent into xmm1 and pncurrent into xmm2 */
|
|
||||||
"movdqa (%0), %%xmm1\n\t"
|
|
||||||
"movdqa (%1), %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */
|
|
||||||
"psrlq $0x3, %%xmm1\n\t"
|
|
||||||
"psrlq $0x3, %%xmm2\n\t"
|
|
||||||
"pand %%xmm4, %%xmm1\n\t"
|
|
||||||
"pand %%xmm4, %%xmm2\n\t"
|
|
||||||
"psubb %%xmm2, %%xmm1\n\t"
|
|
||||||
"pabsb %%xmm1, %%xmm2\n\t"
|
|
||||||
"movdqa %%xmm2, %%xmm1\n\t"
|
|
||||||
"punpckldq %%xmm1, %%xmm1\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm1\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm1\n\t"
|
|
||||||
"punpckhdq %%xmm2, %%xmm2\n\t"
|
|
||||||
"pshufb %%xmm3, %%xmm2\n\t"
|
|
||||||
"psadbw %%xmm0, %%xmm2\n\t"
|
|
||||||
"packuswb %%xmm2, %%xmm1\n\t"
|
|
||||||
|
|
||||||
"pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together
|
|
||||||
|
|
||||||
#if defined(__x86_64__)
|
|
||||||
"pcmpgtd %%xmm8, %%xmm1\n\t" // Compare average delta with the threshold
|
|
||||||
#else
|
|
||||||
"movd %%eax, %%xmm7\n\t" // Setup the threshold
|
|
||||||
"pshufd $0x0, %%xmm7, %%xmm7\n\t"
|
|
||||||
|
|
||||||
"pcmpgtd %%xmm7, %%xmm1\n\t" // Compare average delta with the threshold
|
|
||||||
#endif
|
|
||||||
"pand %%xmm1, %%xmm5\n\t" // Filter out pixels in pabove that shouldn't be copied
|
|
||||||
"pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced
|
|
||||||
|
|
||||||
"por %%xmm5, %%xmm1\n\t" // Put the new values in pcurrent
|
|
||||||
"movntdq %%xmm1, (%0)\n\t" // Write pcurrent
|
|
||||||
:
|
|
||||||
: "r" (col1), "r" (col2), "r" (max_ptr2), "r" (max_ptr), "r" (row_width), "m" (threshold_val), "m" (*movemask2)
|
|
||||||
#if defined(__x86_64__)
|
|
||||||
: "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "cc", "memory"
|
|
||||||
#else
|
|
||||||
: "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "cc", "memory"
|
|
||||||
#endif
|
|
||||||
);
|
|
||||||
#else
|
|
||||||
Panic("SSE function called on a non x86\\x86-64 platform");
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ABGR SSSE3 */
|
|
||||||
/*
 * SSSE3 four-field deinterlace for 32-bit-per-pixel images (ABGR variant).
 *
 * col1 is modified in place; col2 is only read. Judging by the register
 * comments (pcurrent / pncurrent) col2 is presumably the following frame -
 * NOTE(review): confirm against the caller.
 *
 * Rows are handled in pairs (pabove, pcurrent), 16 bytes (4 pixels) at a time:
 *   - For each pixel a delta between col1 and col2 is computed from the top
 *     5 bits of bytes 1, 2 and 3 of the pixel, weighted 1, 5 and 2 (byte 0 is
 *     ignored). With an A,B,G,R byte order that would be B + 5*G + 2*R -
 *     the byte order is assumed from the function name.
 *   - The deltas of pabove and pcurrent are averaged and compared with
 *     threshold (signed 32-bit compare, strictly greater).
 *   - Where the threshold is exceeded, pcurrent is replaced with the average
 *     of pabove and pbelow; on the final row pair there is no pbelow, so
 *     pabove is copied instead. Other pixels are left untouched.
 *
 * Requirements that follow from the instructions used:
 *   - col1, col2 and width*4 must be multiples of 16 (movdqa / movntdq fault
 *     on unaligned addresses).
 *   - The main loop body runs before the first end-of-image test and always
 *     touches rows 0..2, and the final pair is rows (n, n+1) where n is the
 *     first even row index >= height-2. An even height of at least 4 keeps
 *     every access inside the image. NOTE(review): odd heights step past the
 *     last row; confirm callers never pass one.
 *
 * @param col1      Image to deinterlace (read and written).
 * @param col2      Reference image used for the delta computation (read only).
 * @param threshold Delta above which a pixel is replaced.
 * @param width     Image width in pixels.
 * @param height    Image height in rows.
 */
#if defined(__i386__) || defined(__x86_64__)
__attribute__((noinline,__target__("ssse3")))
#endif
void ssse3_deinterlace_4field_abgr(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height) {
#if ((defined(__i386__) || defined(__x86_64__) || defined(ZM_KEEP_SSE)) && !defined(ZM_STRIP_SSE))
  /* pshufb control: per pixel, replicate byte 2 five times, byte 3 twice and
     byte 1 once so that a following psadbw yields 5*b2 + 2*b3 + b1 */
  __attribute__((aligned(16))) static const uint8_t movemask2[16] = {2,2,2,2,2,3,3,1,10,10,10,10,10,11,11,9};

  const uint32_t threshold_val = threshold;

  unsigned long row_width = width*4;                    /* Bytes per row */
  uint8_t* max_ptr = col1 + (row_width * (height-2));   /* Start of the last row pair */
  uint8_t* max_ptr2 = col1 + row_width;                 /* End of the current pabove row */

  /*
   * Operands: %0 = col1 cursor, %1 = col2 cursor, %2 = end of current row,
   *           %3 = max_ptr, %4 = row_width, %5 = threshold, %6 = shuffle mask.
   * %0-%2 are advanced by the code, so they are declared read-write ("+&r");
   * the early-clobber keeps them out of registers shared with %3/%4.
   * Numeric local labels are used so the code never defines a global symbol.
   */
  __asm__ __volatile__ (
  /* xmm4 = 0x1F in every byte (keeps the 5 significant bits after the shift) */
  "mov $0x1F1F1F1F, %%eax\n\t"
  "movd %%eax, %%xmm4\n\t"
  "pshufd $0x0, %%xmm4, %%xmm4\n\t"
  /* xmm3 = shuffle mask */
  "movdqa %6, %%xmm3\n\t"
  /* eax = threshold; it stays in eax for the whole routine */
  "mov %5, %%eax\n\t"
#if defined(__x86_64__)
  /* 64-bit has a spare register: keep the broadcast threshold in xmm8 */
  "movd %%eax, %%xmm8\n\t"
  "pshufd $0x0, %%xmm8, %%xmm8\n\t"
#endif
  /* Zero the temporary register (psadbw reference operand) */
  "pxor %%xmm0, %%xmm0\n\t"

  /* ---- Main loop: every row pair that has a row below it ---- */
  "1:\n\t"

  /* Load pabove into xmm1 and pnabove into xmm2 */
  "movdqa (%0), %%xmm1\n\t"
  "movdqa (%1), %%xmm2\n\t"
  "movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */
  "psrlq $0x3, %%xmm1\n\t"
  "psrlq $0x3, %%xmm2\n\t"
  "pand %%xmm4, %%xmm1\n\t"
  "pand %%xmm4, %%xmm2\n\t"
  "psubb %%xmm2, %%xmm1\n\t"
  "pabsb %%xmm1, %%xmm2\n\t"
  "movdqa %%xmm2, %%xmm1\n\t"
  "punpckldq %%xmm1, %%xmm1\n\t"
  "pshufb %%xmm3, %%xmm1\n\t"
  "psadbw %%xmm0, %%xmm1\n\t"
  "punpckhdq %%xmm2, %%xmm2\n\t"
  "pshufb %%xmm3, %%xmm2\n\t"
  "psadbw %%xmm0, %%xmm2\n\t"
  "packuswb %%xmm2, %%xmm1\n\t"
  "movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */

  /* Next row */
  "add %4, %0\n\t"
  "add %4, %1\n\t"

  /* Load pcurrent into xmm1 and pncurrent into xmm2 */
  "movdqa (%0), %%xmm1\n\t"
  "movdqa (%1), %%xmm2\n\t"
  "movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */
  "psrlq $0x3, %%xmm1\n\t"
  "psrlq $0x3, %%xmm2\n\t"
  "pand %%xmm4, %%xmm1\n\t"
  "pand %%xmm4, %%xmm2\n\t"
  "psubb %%xmm2, %%xmm1\n\t"
  "pabsb %%xmm1, %%xmm2\n\t"
  "movdqa %%xmm2, %%xmm1\n\t"
  "punpckldq %%xmm1, %%xmm1\n\t"
  "pshufb %%xmm3, %%xmm1\n\t"
  "psadbw %%xmm0, %%xmm1\n\t"
  "punpckhdq %%xmm2, %%xmm2\n\t"
  "pshufb %%xmm3, %%xmm2\n\t"
  "psadbw %%xmm0, %%xmm2\n\t"
  "packuswb %%xmm2, %%xmm1\n\t"

  "pavgb %%xmm7, %%xmm1\n\t"         // Average the two deltas together

#if defined(__x86_64__)
  "pcmpgtd %%xmm8, %%xmm1\n\t"       // Compare average delta with the threshold
#else
  "movd %%eax, %%xmm7\n\t"           // Setup the threshold (xmm7 is free again here)
  "pshufd $0x0, %%xmm7, %%xmm7\n\t"

  "pcmpgtd %%xmm7, %%xmm1\n\t"       // Compare average delta with the threshold
#endif
  "movdqa (%0,%4), %%xmm2\n\t"       // Load pbelow
  "pavgb %%xmm5, %%xmm2\n\t"         // Average pabove and pbelow
  "pand %%xmm1, %%xmm2\n\t"          // Filter out pixels in avg that shouldn't be copied
  "pandn %%xmm6, %%xmm1\n\t"         // Filter out pixels in pcurrent that should be replaced

  "por %%xmm2, %%xmm1\n\t"           // Put the new values in pcurrent
  "movntdq %%xmm1, (%0)\n\t"         // Write pcurrent

  "sub %4, %0\n\t"                   // Restore pcurrent to pabove
  "sub %4, %1\n\t"                   // Restore pncurrent to pnabove

  /* Next pixels */
  "add $0x10, %0\n\t"                // Advance the col1 cursor by 16 bytes
  "add $0x10, %1\n\t"                // Advance the col2 cursor by 16 bytes

  /* Check if we reached the row end */
  "cmp %2, %0\n\t"
  "jb 1b\n\t"                        // Go for another iteration

  /* Next row pair: the cursors sit at the start of the row just written,
     so one more row_width skips to the next pabove row */
  "add %4, %0\n\t"
  "add %4, %1\n\t"
  "mov %0, %2\n\t"
  "add %4, %2\n\t"                   // max_ptr2 = end of the new pabove row

  /* Check if we reached the end */
  "cmp %3, %0\n\t"
  "jb 1b\n\t"                        // Go for another iteration

  /* ---- Last row pair: no pbelow exists, so pabove is copied instead.
          Looped over the whole row; %2 already marks the end of pabove ---- */
  "2:\n\t"

  /* Load pabove into xmm1 and pnabove into xmm2 */
  "movdqa (%0), %%xmm1\n\t"
  "movdqa (%1), %%xmm2\n\t"
  "movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */
  "psrlq $0x3, %%xmm1\n\t"
  "psrlq $0x3, %%xmm2\n\t"
  "pand %%xmm4, %%xmm1\n\t"
  "pand %%xmm4, %%xmm2\n\t"
  "psubb %%xmm2, %%xmm1\n\t"
  "pabsb %%xmm1, %%xmm2\n\t"
  "movdqa %%xmm2, %%xmm1\n\t"
  "punpckldq %%xmm1, %%xmm1\n\t"
  "pshufb %%xmm3, %%xmm1\n\t"
  "psadbw %%xmm0, %%xmm1\n\t"
  "punpckhdq %%xmm2, %%xmm2\n\t"
  "pshufb %%xmm3, %%xmm2\n\t"
  "psadbw %%xmm0, %%xmm2\n\t"
  "packuswb %%xmm2, %%xmm1\n\t"
  "movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */

  /* Next row */
  "add %4, %0\n\t"
  "add %4, %1\n\t"

  /* Load pcurrent into xmm1 and pncurrent into xmm2 */
  "movdqa (%0), %%xmm1\n\t"
  "movdqa (%1), %%xmm2\n\t"
  "movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */
  "psrlq $0x3, %%xmm1\n\t"
  "psrlq $0x3, %%xmm2\n\t"
  "pand %%xmm4, %%xmm1\n\t"
  "pand %%xmm4, %%xmm2\n\t"
  "psubb %%xmm2, %%xmm1\n\t"
  "pabsb %%xmm1, %%xmm2\n\t"
  "movdqa %%xmm2, %%xmm1\n\t"
  "punpckldq %%xmm1, %%xmm1\n\t"
  "pshufb %%xmm3, %%xmm1\n\t"
  "psadbw %%xmm0, %%xmm1\n\t"
  "punpckhdq %%xmm2, %%xmm2\n\t"
  "pshufb %%xmm3, %%xmm2\n\t"
  "psadbw %%xmm0, %%xmm2\n\t"
  "packuswb %%xmm2, %%xmm1\n\t"

  "pavgb %%xmm7, %%xmm1\n\t"         // Average the two deltas together

#if defined(__x86_64__)
  "pcmpgtd %%xmm8, %%xmm1\n\t"       // Compare average delta with the threshold
#else
  "movd %%eax, %%xmm7\n\t"           // Setup the threshold (xmm7 is free again here)
  "pshufd $0x0, %%xmm7, %%xmm7\n\t"

  "pcmpgtd %%xmm7, %%xmm1\n\t"       // Compare average delta with the threshold
#endif
  "pand %%xmm1, %%xmm5\n\t"          // Filter out pixels in pabove that shouldn't be copied
  "pandn %%xmm6, %%xmm1\n\t"         // Filter out pixels in pcurrent that should be replaced

  "por %%xmm5, %%xmm1\n\t"           // Put the new values in pcurrent
  "movntdq %%xmm1, (%0)\n\t"         // Write pcurrent

  "sub %4, %0\n\t"                   // Restore pcurrent to pabove
  "sub %4, %1\n\t"                   // Restore pncurrent to pnabove

  /* Next pixels of the last row pair */
  "add $0x10, %0\n\t"
  "add $0x10, %1\n\t"

  /* Check if we reached the row end */
  "cmp %2, %0\n\t"
  "jb 2b\n\t"                        // Go for another iteration
  : "+&r" (col1), "+&r" (col2), "+&r" (max_ptr2)
  : "r" (max_ptr), "r" (row_width), "m" (threshold_val), "m" (*movemask2)
#if defined(__x86_64__)
  : "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "cc", "memory"
#else
  : "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "cc", "memory"
#endif
  );
#else
  Panic("SSE function called on a non x86\\x86-64 platform");
#endif
}
|
|
||||||
|
|
|
@ -312,8 +312,3 @@ void std_deinterlace_4field_rgba(uint8_t* col1, uint8_t* col2, unsigned int thre
|
||||||
void std_deinterlace_4field_bgra(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height);
|
void std_deinterlace_4field_bgra(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height);
|
||||||
void std_deinterlace_4field_argb(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height);
|
void std_deinterlace_4field_argb(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height);
|
||||||
void std_deinterlace_4field_abgr(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height);
|
void std_deinterlace_4field_abgr(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height);
|
||||||
void ssse3_deinterlace_4field_gray8(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height);
|
|
||||||
void ssse3_deinterlace_4field_rgba(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height);
|
|
||||||
void ssse3_deinterlace_4field_bgra(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height);
|
|
||||||
void ssse3_deinterlace_4field_argb(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height);
|
|
||||||
void ssse3_deinterlace_4field_abgr(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height);
|
|
||||||
|
|
Loading…
Reference in New Issue