From 67f7ad40aee3b52aeade4c4851f952c08666c8d7 Mon Sep 17 00:00:00 2001 From: Kfir Itzhak Date: Tue, 14 Mar 2017 22:36:26 +0200 Subject: [PATCH] Removed SSSE3 deinterlacing functions, as they were typically equal or slower than the standard code (when compiled with -O2 or better) The function is too complicated to be vectorized efficiently --- src/zm_image.cpp | 897 +---------------------------------------------- src/zm_image.h | 5 - 2 files changed, 12 insertions(+), 890 deletions(-) diff --git a/src/zm_image.cpp b/src/zm_image.cpp index aeadc4d27..7380c3d66 100644 --- a/src/zm_image.cpp +++ b/src/zm_image.cpp @@ -266,23 +266,18 @@ void Image::Initialise() fptr_delta8_gray8 = &std_delta8_gray8; Debug(4,"Delta: CPU extensions disabled, using standard delta functions"); } - - /* Use SSSE3 deinterlace functions? */ - if(config.cpu_extensions && sseversion >= 35) { - fptr_deinterlace_4field_rgba = &ssse3_deinterlace_4field_rgba; - fptr_deinterlace_4field_bgra = &ssse3_deinterlace_4field_bgra; - fptr_deinterlace_4field_argb = &ssse3_deinterlace_4field_argb; - fptr_deinterlace_4field_abgr = &ssse3_deinterlace_4field_abgr; - fptr_deinterlace_4field_gray8 = &ssse3_deinterlace_4field_gray8; - Debug(4,"Deinterlace: Using SSSE3 delta functions"); - } else { - fptr_deinterlace_4field_rgba = &std_deinterlace_4field_rgba; - fptr_deinterlace_4field_bgra = &std_deinterlace_4field_bgra; - fptr_deinterlace_4field_argb = &std_deinterlace_4field_argb; - fptr_deinterlace_4field_abgr = &std_deinterlace_4field_abgr; - fptr_deinterlace_4field_gray8 = &std_deinterlace_4field_gray8; - Debug(4,"Deinterlace: Using standard delta functions"); - } + + /* + SSSE3 deinterlacing functions were removed because they were usually equal + or slower than the standard code (compiled with -O2 or better) + The function is too complicated to be vectorized efficiently + */ + fptr_deinterlace_4field_rgba = &std_deinterlace_4field_rgba; + fptr_deinterlace_4field_bgra = &std_deinterlace_4field_bgra; + fptr_deinterlace_4field_argb = &std_deinterlace_4field_argb; + fptr_deinterlace_4field_abgr = &std_deinterlace_4field_abgr; + fptr_deinterlace_4field_gray8 = &std_deinterlace_4field_gray8; + Debug(4,"Deinterlace: Using standard functions"); /* Use SSE2 aligned memory copy? */ if(config.cpu_extensions && sseversion >= 20) { @@ -4954,871 +4949,3 @@ __attribute__((noinline)) void std_deinterlace_4field_abgr(uint8_t* col1, uint8_ pncurrent += 4; } } - -/* Grayscale SSSE3 */ -#if defined(__i386__) || defined(__x86_64__) -__attribute__((noinline,__target__("ssse3"))) -#endif -void ssse3_deinterlace_4field_gray8(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height) { - -#if ((defined(__i386__) || defined(__x86_64__) || defined(ZM_KEEP_SSE)) && !defined(ZM_STRIP_SSE)) - union { - uint32_t int32; - uint8_t int8a[4]; - } threshold_mask; - threshold_mask.int8a[0] = threshold; - threshold_mask.int8a[1] = 0; - threshold_mask.int8a[2] = threshold; - threshold_mask.int8a[3] = 0; - - unsigned long row_width = width; - uint8_t* max_ptr = col1 + (row_width * (height-2)); - uint8_t* max_ptr2 = col1 + row_width; - - __asm__ __volatile__ ( - /* Load the threshold */ - "mov %5, %%eax\n\t" - "movd %%eax, %%xmm4\n\t" - "pshufd $0x0, %%xmm4, %%xmm4\n\t" - /* Zero the temporary register */ - "pxor %%xmm0, %%xmm0\n\t" - - "algo_ssse3_deinterlace_4field_gray8:\n\t" - - /* Load pabove into xmm1 and pnabove into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */ - "pmaxub %%xmm2, %%xmm1\n\t" - "pminub %%xmm5, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */ - - /* Next row */ - "add %4, %0\n\t" - "add %4, %1\n\t" - - /* Load pcurrent into xmm1 and pncurrent into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */ - "pmaxub %%xmm2, %%xmm1\n\t" - "pminub %%xmm6, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - - "pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together - "movdqa %%xmm1, %%xmm2\n\t" - - /* Do the comparison on words instead of bytes because we don't have unsigned comparison */ - "punpcklbw %%xmm0, %%xmm1\n\t" // Expand pixels 0-7 into words into xmm1 - "punpckhbw %%xmm0, %%xmm2\n\t" // Expand pixels 8-15 into words into xmm2 - "pcmpgtw %%xmm4, %%xmm1\n\t" // Compare average delta with threshold for pixels 0-7 - "pcmpgtw %%xmm4, %%xmm2\n\t" // Compare average delta with threshold for pixels 8-15 - "packsswb %%xmm2, %%xmm1\n\t" // Pack the comparison results into xmm1 - - "movdqa (%0,%4), %%xmm2\n\t" // Load pbelow - "pavgb %%xmm5, %%xmm2\n\t" // Average pabove and pbelow - "pand %%xmm1, %%xmm2\n\t" // Filter out pixels in avg that shouldn't be copied - "pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced - - "por %%xmm2, %%xmm1\n\t" // Put the new values in pcurrent - "movntdq %%xmm1, (%0)\n\t" // Write pcurrent - - "sub %4, %0\n\t" // Restore pcurrent to pabove - "sub %4, %1\n\t" // Restore pncurrent to pnabove - - /* Next pixels */ - "add $0x10, %0\n\t" // Add 16 to pcurrent - "add $0x10, %1\n\t" // Add 16 to pncurrent - - /* Check if we reached the row end */ - "cmp %2, %0\n\t" - "jb algo_ssse3_deinterlace_4field_gray8\n\t" // Go for another iteration - - /* Next row */ - "add %4, %0\n\t" // Add width to pcurrent - "add %4, %1\n\t" // Add width to pncurrent - "mov %0, %2\n\t" - "add %4, %2\n\t" // Add width to max_ptr2 - - /* Check if we reached the end */ - "cmp %3, %0\n\t" - "jb algo_ssse3_deinterlace_4field_gray8\n\t" // Go for another iteration - - /* Special case for the last line */ - /* Load pabove into xmm1 and pnabove into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */ - "pmaxub %%xmm2, %%xmm1\n\t" - "pminub %%xmm5, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */ - - /* Next row */ - "add %4, %0\n\t" - "add %4, %1\n\t" - - /* Load pcurrent into xmm1 and pncurrent into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */ - "pmaxub %%xmm2, %%xmm1\n\t" - "pminub %%xmm6, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - - "pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together - "movdqa %%xmm1, %%xmm2\n\t" - - /* Do the comparison on words instead of bytes because we don't have unsigned comparison */ - "punpcklbw %%xmm0, %%xmm1\n\t" // Expand pixels 0-7 into words into xmm1 - "punpckhbw %%xmm0, %%xmm2\n\t" // Expand pixels 8-15 into words into xmm2 - "pcmpgtw %%xmm4, %%xmm1\n\t" // Compare average delta with threshold for pixels 0-7 - "pcmpgtw %%xmm4, %%xmm2\n\t" // Compare average delta with threshold for pixels 8-15 - "packsswb %%xmm2, %%xmm1\n\t" // Pack the comparison results into xmm1 - - "pand %%xmm1, %%xmm5\n\t" // Filter out pixels in pabove that shouldn't be copied - "pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced - - "por %%xmm5, %%xmm1\n\t" // Put the new values in pcurrent - "movntdq %%xmm1, (%0)\n\t" // Write pcurrent - : - : "r" (col1), "r" (col2), "r" (max_ptr2), "r" (max_ptr), "r" (row_width), "m" (threshold_mask.int32) - : "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "cc", "memory" - ); -#else - Panic("SSE function called on a non x86\\x86-64 platform"); -#endif -} - -/* RGBA SSSE3 */ -#if defined(__i386__) || defined(__x86_64__) -__attribute__((noinline,__target__("ssse3"))) -#endif -void ssse3_deinterlace_4field_rgba(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height) { -#if ((defined(__i386__) || defined(__x86_64__) || defined(ZM_KEEP_SSE)) && !defined(ZM_STRIP_SSE)) - __attribute__((aligned(16))) static const uint8_t movemask2[16] = {1,1,1,1,1,0,0,2,9,9,9,9,9,8,8,10}; - - const uint32_t threshold_val = threshold; - - unsigned long row_width = width*4; - uint8_t* max_ptr = col1 + (row_width * (height-2)); - uint8_t* max_ptr2 = col1 + row_width; - - __asm__ __volatile__ ( - "mov $0x1F1F1F1F, %%eax\n\t" - "movd %%eax, %%xmm4\n\t" - "pshufd $0x0, %%xmm4, %%xmm4\n\t" - "movdqa %6, %%xmm3\n\t" - "mov %5, %%eax\n\t" -#if defined(__x86_64__) - "movd %%eax, %%xmm8\n\t" - "pshufd $0x0, %%xmm8, %%xmm8\n\t" -#endif - /* Zero the temporary register */ - "pxor %%xmm0, %%xmm0\n\t" - - "algo_ssse3_deinterlace_4field_rgba:\n\t" - - /* Load pabove into xmm1 and pnabove into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - "movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */ - - /* Next row */ - "add %4, %0\n\t" - "add %4, %1\n\t" - - /* Load pcurrent into xmm1 and pncurrent into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - - "pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together - -#if defined(__x86_64__) - "pcmpgtd %%xmm8, %%xmm1\n\t" // Compare average delta with the threshold -#else - "movd %%eax, %%xmm7\n\t" // Setup the threshold - "pshufd $0x0, %%xmm7, %%xmm7\n\t" - - "pcmpgtd %%xmm7, %%xmm1\n\t" // Compare average delta with the threshold -#endif - "movdqa (%0,%4), %%xmm2\n\t" // Load pbelow - "pavgb %%xmm5, %%xmm2\n\t" // Average pabove and pbelow - "pand %%xmm1, %%xmm2\n\t" // Filter out pixels in avg that shouldn't be copied - "pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced - - "por %%xmm2, %%xmm1\n\t" // Put the new values in pcurrent - "movntdq %%xmm1, (%0)\n\t" // Write pcurrent - - "sub %4, %0\n\t" // Restore pcurrent to pabove - "sub %4, %1\n\t" // Restore pncurrent to pnabove - - /* Next pixels */ - "add $0x10, %0\n\t" // Add 16 to pcurrent - "add $0x10, %1\n\t" // Add 16 to pncurrent - - /* Check if we reached the row end */ - "cmp %2, %0\n\t" - "jb algo_ssse3_deinterlace_4field_rgba\n\t" // Go for another iteration - - /* Next row */ - "add %4, %0\n\t" // Add width to pcurrent - "add %4, %1\n\t" // Add width to pncurrent - "mov %0, %2\n\t" - "add %4, %2\n\t" // Add width to max_ptr2 - - /* Check if we reached the end */ - "cmp %3, %0\n\t" - "jb algo_ssse3_deinterlace_4field_rgba\n\t" // Go for another iteration - - /* Special case for the last line */ - /* Load pabove into xmm1 and pnabove into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - "movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */ - - /* Next row */ - "add %4, %0\n\t" - "add %4, %1\n\t" - - /* Load pcurrent into xmm1 and pncurrent into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - - "pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together - -#if defined(__x86_64__) - "pcmpgtd %%xmm8, %%xmm1\n\t" // Compare average delta with the threshold -#else - "movd %%eax, %%xmm7\n\t" // Setup the threshold - "pshufd $0x0, %%xmm7, %%xmm7\n\t" - - "pcmpgtd %%xmm7, %%xmm1\n\t" // Compare average delta with the threshold -#endif - "pand %%xmm1, %%xmm5\n\t" // Filter out pixels in pabove that shouldn't be copied - "pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced - - "por %%xmm5, %%xmm1\n\t" // Put the new values in pcurrent - "movntdq %%xmm1, (%0)\n\t" // Write pcurrent - : - : "r" (col1), "r" (col2), "r" (max_ptr2), "r" (max_ptr), "r" (row_width), "m" (threshold_val), "m" (*movemask2) -#if defined(__x86_64__) - : "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "cc", "memory" -#else - : "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "cc", "memory" -#endif - ); -#else - Panic("SSE function called on a non x86\\x86-64 platform"); -#endif -} - -/* BGRA SSSE3 */ -#if defined(__i386__) || defined(__x86_64__) -__attribute__((noinline,__target__("ssse3"))) -#endif -void ssse3_deinterlace_4field_bgra(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height) { -#if ((defined(__i386__) || defined(__x86_64__) || defined(ZM_KEEP_SSE)) && !defined(ZM_STRIP_SSE)) - __attribute__((aligned(16))) static const uint8_t movemask2[16] = {1,1,1,1,1,2,2,0,9,9,9,9,9,10,10,8}; - - const uint32_t threshold_val = threshold; - - unsigned long row_width = width*4; - uint8_t* max_ptr = col1 + (row_width * (height-2)); - uint8_t* max_ptr2 = col1 + row_width; - - __asm__ __volatile__ ( - "mov $0x1F1F1F1F, %%eax\n\t" - "movd %%eax, %%xmm4\n\t" - "pshufd $0x0, %%xmm4, %%xmm4\n\t" - "movdqa %6, %%xmm3\n\t" - "mov %5, %%eax\n\t" -#if defined(__x86_64__) - "movd %%eax, %%xmm8\n\t" - "pshufd $0x0, %%xmm8, %%xmm8\n\t" -#endif - /* Zero the temporary register */ - "pxor %%xmm0, %%xmm0\n\t" - - "algo_ssse3_deinterlace_4field_bgra:\n\t" - - /* Load pabove into xmm1 and pnabove into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - "movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */ - - /* Next row */ - "add %4, %0\n\t" - "add %4, %1\n\t" - - /* Load pcurrent into xmm1 and pncurrent into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - - "pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together - -#if defined(__x86_64__) - "pcmpgtd %%xmm8, %%xmm1\n\t" // Compare average delta with the threshold -#else - "movd %%eax, %%xmm7\n\t" // Setup the threshold - "pshufd $0x0, %%xmm7, %%xmm7\n\t" - - "pcmpgtd %%xmm7, %%xmm1\n\t" // Compare average delta with the threshold -#endif - "movdqa (%0,%4), %%xmm2\n\t" // Load pbelow - "pavgb %%xmm5, %%xmm2\n\t" // Average pabove and pbelow - "pand %%xmm1, %%xmm2\n\t" // Filter out pixels in avg that shouldn't be copied - "pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced - - "por %%xmm2, %%xmm1\n\t" // Put the new values in pcurrent - "movntdq %%xmm1, (%0)\n\t" // Write pcurrent - - "sub %4, %0\n\t" // Restore pcurrent to pabove - "sub %4, %1\n\t" // Restore pncurrent to pnabove - - /* Next pixels */ - "add $0x10, %0\n\t" // Add 16 to pcurrent - "add $0x10, %1\n\t" // Add 16 to pncurrent - - /* Check if we reached the row end */ - "cmp %2, %0\n\t" - "jb algo_ssse3_deinterlace_4field_bgra\n\t" // Go for another iteration - - /* Next row */ - "add %4, %0\n\t" // Add width to pcurrent - "add %4, %1\n\t" // Add width to pncurrent - "mov %0, %2\n\t" - "add %4, %2\n\t" // Add width to max_ptr2 - - /* Check if we reached the end */ - "cmp %3, %0\n\t" - "jb algo_ssse3_deinterlace_4field_bgra\n\t" // Go for another iteration - - /* Special case for the last line */ - /* Load pabove into xmm1 and pnabove into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - "movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */ - - /* Next row */ - "add %4, %0\n\t" - "add %4, %1\n\t" - - /* Load pcurrent into xmm1 and pncurrent into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - - "pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together - -#if defined(__x86_64__) - "pcmpgtd %%xmm8, %%xmm1\n\t" // Compare average delta with the threshold -#else - "movd %%eax, %%xmm7\n\t" // Setup the threshold - "pshufd $0x0, %%xmm7, %%xmm7\n\t" - - "pcmpgtd %%xmm7, %%xmm1\n\t" // Compare average delta with the threshold -#endif - "pand %%xmm1, %%xmm5\n\t" // Filter out pixels in pabove that shouldn't be copied - "pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced - - "por %%xmm5, %%xmm1\n\t" // Put the new values in pcurrent - "movntdq %%xmm1, (%0)\n\t" // Write pcurrent - : - : "r" (col1), "r" (col2), "r" (max_ptr2), "r" (max_ptr), "r" (row_width), "m" (threshold_val), "m" (*movemask2) -#if defined(__x86_64__) - : "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "cc", "memory" -#else - : "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "cc", "memory" -#endif - ); -#else - Panic("SSE function called on a non x86\\x86-64 platform"); -#endif -} - -/* ARGB SSSE3 */ -#if defined(__i386__) || defined(__x86_64__) -__attribute__((noinline,__target__("ssse3"))) -#endif -void ssse3_deinterlace_4field_argb(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height) { -#if ((defined(__i386__) || defined(__x86_64__) || defined(ZM_KEEP_SSE)) && !defined(ZM_STRIP_SSE)) - __attribute__((aligned(16))) static const uint8_t movemask2[16] = {2,2,2,2,2,1,1,3,10,10,10,10,10,9,9,11}; - - const uint32_t threshold_val = threshold; - - unsigned long row_width = width*4; - uint8_t* max_ptr = col1 + (row_width * (height-2)); - uint8_t* max_ptr2 = col1 + row_width; - - __asm__ __volatile__ ( - "mov $0x1F1F1F1F, %%eax\n\t" - "movd %%eax, %%xmm4\n\t" - "pshufd $0x0, %%xmm4, %%xmm4\n\t" - "movdqa %6, %%xmm3\n\t" - "mov %5, %%eax\n\t" -#if defined(__x86_64__) - "movd %%eax, %%xmm8\n\t" - "pshufd $0x0, %%xmm8, %%xmm8\n\t" -#endif - /* Zero the temporary register */ - "pxor %%xmm0, %%xmm0\n\t" - - "algo_ssse3_deinterlace_4field_argb:\n\t" - - /* Load pabove into xmm1 and pnabove into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - "movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */ - - /* Next row */ - "add %4, %0\n\t" - "add %4, %1\n\t" - - /* Load pcurrent into xmm1 and pncurrent into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - - "pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together - -#if defined(__x86_64__) - "pcmpgtd %%xmm8, %%xmm1\n\t" // Compare average delta with the threshold -#else - "movd %%eax, %%xmm7\n\t" // Setup the threshold - "pshufd $0x0, %%xmm7, %%xmm7\n\t" - - "pcmpgtd %%xmm7, %%xmm1\n\t" // Compare average delta with the threshold -#endif - "movdqa (%0,%4), %%xmm2\n\t" // Load pbelow - "pavgb %%xmm5, %%xmm2\n\t" // Average pabove and pbelow - "pand %%xmm1, %%xmm2\n\t" // Filter out pixels in avg that shouldn't be copied - "pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced - - "por %%xmm2, %%xmm1\n\t" // Put the new values in pcurrent - "movntdq %%xmm1, (%0)\n\t" // Write pcurrent - - "sub %4, %0\n\t" // Restore pcurrent to pabove - "sub %4, %1\n\t" // Restore pncurrent to pnabove - - /* Next pixels */ - "add $0x10, %0\n\t" // Add 16 to pcurrent - "add $0x10, %1\n\t" // Add 16 to pncurrent - - /* Check if we reached the row end */ - "cmp %2, %0\n\t" - "jb algo_ssse3_deinterlace_4field_argb\n\t" // Go for another iteration - - /* Next row */ - "add %4, %0\n\t" // Add width to pcurrent - "add %4, %1\n\t" // Add width to pncurrent - "mov %0, %2\n\t" - "add %4, %2\n\t" // Add width to max_ptr2 - - /* Check if we reached the end */ - "cmp %3, %0\n\t" - "jb algo_ssse3_deinterlace_4field_argb\n\t" // Go for another iteration - - /* Special case for the last line */ - /* Load pabove into xmm1 and pnabove into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - "movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */ - - /* Next row */ - "add %4, %0\n\t" - "add %4, %1\n\t" - - /* Load pcurrent into xmm1 and pncurrent into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - - "pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together - -#if defined(__x86_64__) - "pcmpgtd %%xmm8, %%xmm1\n\t" // Compare average delta with the threshold -#else - "movd %%eax, %%xmm7\n\t" // Setup the threshold - "pshufd $0x0, %%xmm7, %%xmm7\n\t" - - "pcmpgtd %%xmm7, %%xmm1\n\t" // Compare average delta with the threshold -#endif - "pand %%xmm1, %%xmm5\n\t" // Filter out pixels in pabove that shouldn't be copied - "pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced - - "por %%xmm5, %%xmm1\n\t" // Put the new values in pcurrent - "movntdq %%xmm1, (%0)\n\t" // Write pcurrent - : - : "r" (col1), "r" (col2), "r" (max_ptr2), "r" (max_ptr), "r" (row_width), "m" (threshold_val), "m" (*movemask2) -#if defined(__x86_64__) - : "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "cc", "memory" -#else - : "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "cc", "memory" -#endif - ); -#else - Panic("SSE function called on a non x86\\x86-64 platform"); -#endif -} - -/* ABGR SSSE3 */ -#if defined(__i386__) || defined(__x86_64__) -__attribute__((noinline,__target__("ssse3"))) -#endif -void ssse3_deinterlace_4field_abgr(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height) { -#if ((defined(__i386__) || defined(__x86_64__) || defined(ZM_KEEP_SSE)) && !defined(ZM_STRIP_SSE)) - __attribute__((aligned(16))) static const uint8_t movemask2[16] = {2,2,2,2,2,3,3,1,10,10,10,10,10,11,11,9}; - - const uint32_t threshold_val = threshold; - - unsigned long row_width = width*4; - uint8_t* max_ptr = col1 + (row_width * (height-2)); - uint8_t* max_ptr2 = col1 + row_width; - - __asm__ __volatile__ ( - "mov $0x1F1F1F1F, %%eax\n\t" - "movd %%eax, %%xmm4\n\t" - "pshufd $0x0, %%xmm4, %%xmm4\n\t" - "movdqa %6, %%xmm3\n\t" - "mov %5, %%eax\n\t" -#if defined(__x86_64__) - "movd %%eax, %%xmm8\n\t" - "pshufd $0x0, %%xmm8, %%xmm8\n\t" -#endif - /* Zero the temporary register */ - "pxor %%xmm0, %%xmm0\n\t" - - "algo_ssse3_deinterlace_4field_abgr:\n\t" - - /* Load pabove into xmm1 and pnabove into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - "movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */ - - /* Next row */ - "add %4, %0\n\t" - "add %4, %1\n\t" - - /* Load pcurrent into xmm1 and pncurrent into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - - "pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together - -#if defined(__x86_64__) - "pcmpgtd %%xmm8, %%xmm1\n\t" // Compare average delta with the threshold -#else - "movd %%eax, %%xmm7\n\t" // Setup the threshold - "pshufd $0x0, %%xmm7, %%xmm7\n\t" - - "pcmpgtd %%xmm7, %%xmm1\n\t" // Compare average delta with the threshold -#endif - "movdqa (%0,%4), %%xmm2\n\t" // Load pbelow - "pavgb %%xmm5, %%xmm2\n\t" // Average pabove and pbelow - "pand %%xmm1, %%xmm2\n\t" // Filter out pixels in avg that shouldn't be copied - "pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced - - "por %%xmm2, %%xmm1\n\t" // Put the new values in pcurrent - "movntdq %%xmm1, (%0)\n\t" // Write pcurrent - - "sub %4, %0\n\t" // Restore pcurrent to pabove - "sub %4, %1\n\t" // Restore pncurrent to pnabove - - /* Next pixels */ - "add $0x10, %0\n\t" // Add 16 to pcurrent - "add $0x10, %1\n\t" // Add 16 to pncurrent - - /* Check if we reached the row end */ - "cmp %2, %0\n\t" - "jb algo_ssse3_deinterlace_4field_abgr\n\t" // Go for another iteration - - /* Next row */ - "add %4, %0\n\t" // Add width to pcurrent - "add %4, %1\n\t" // Add width to pncurrent - "mov %0, %2\n\t" - "add %4, %2\n\t" // Add width to max_ptr2 - - /* Check if we reached the end */ - "cmp %3, %0\n\t" - "jb algo_ssse3_deinterlace_4field_abgr\n\t" // Go for another iteration - - /* Special case for the last line */ - /* Load pabove into xmm1 and pnabove into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm5\n\t" /* Keep backup of pabove in xmm5 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - "movdqa %%xmm1, %%xmm7\n\t" /* Backup of delta2 in xmm7 for now */ - - /* Next row */ - "add %4, %0\n\t" - "add %4, %1\n\t" - - /* Load pcurrent into xmm1 and pncurrent into xmm2 */ - "movdqa (%0), %%xmm1\n\t" - "movdqa (%1), %%xmm2\n\t" - "movdqa %%xmm1, %%xmm6\n\t" /* Keep backup of pcurrent in xmm6 */ - "psrlq $0x3, %%xmm1\n\t" - "psrlq $0x3, %%xmm2\n\t" - "pand %%xmm4, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "psubb %%xmm2, %%xmm1\n\t" - "pabsb %%xmm1, %%xmm2\n\t" - "movdqa %%xmm2, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "psadbw %%xmm0, %%xmm1\n\t" - "punpckhdq %%xmm2, %%xmm2\n\t" - "pshufb %%xmm3, %%xmm2\n\t" - "psadbw %%xmm0, %%xmm2\n\t" - "packuswb %%xmm2, %%xmm1\n\t" - - "pavgb %%xmm7, %%xmm1\n\t" // Average the two deltas together - -#if defined(__x86_64__) - "pcmpgtd %%xmm8, %%xmm1\n\t" // Compare average delta with the threshold -#else - "movd %%eax, %%xmm7\n\t" // Setup the threshold - "pshufd $0x0, %%xmm7, %%xmm7\n\t" - - "pcmpgtd %%xmm7, %%xmm1\n\t" // Compare average delta with the threshold -#endif - "pand %%xmm1, %%xmm5\n\t" // Filter out pixels in pabove that shouldn't be copied - "pandn %%xmm6, %%xmm1\n\t" // Filter out pixels in pcurrent that should be replaced - - "por %%xmm5, %%xmm1\n\t" // Put the new values in pcurrent - "movntdq %%xmm1, (%0)\n\t" // Write pcurrent - : - : "r" (col1), "r" (col2), "r" (max_ptr2), "r" (max_ptr), "r" (row_width), "m" (threshold_val), "m" (*movemask2) -#if defined(__x86_64__) - : "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "cc", "memory" -#else - : "%eax", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "cc", "memory" -#endif - ); -#else - Panic("SSE function called on a non x86\\x86-64 platform"); -#endif -} diff --git a/src/zm_image.h b/src/zm_image.h index 3c448d3af..539f85b67 100644 --- a/src/zm_image.h +++ b/src/zm_image.h @@ -312,8 +312,3 @@ void std_deinterlace_4field_rgba(uint8_t* col1, uint8_t* col2, unsigned int thre void std_deinterlace_4field_bgra(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height); void std_deinterlace_4field_argb(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height); void std_deinterlace_4field_abgr(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height); -void ssse3_deinterlace_4field_gray8(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height); -void ssse3_deinterlace_4field_rgba(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height); -void ssse3_deinterlace_4field_bgra(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height); -void ssse3_deinterlace_4field_argb(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height); -void ssse3_deinterlace_4field_abgr(uint8_t* col1, uint8_t* col2, unsigned int threshold, unsigned int width, unsigned int height);