Merge branch 'mastertheknife-armv7_neon'

2017-05-10 07:47:04 -05:00 · 2017-05-10 07:47:04 -05:00 · 021c136640
parent e6c60737e9 ccc2cc832f
commit 021c136640
11 changed files with 401 additions and 45 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -66,6 +66,24 @@ set(CMAKE_CXX_FLAGS_DEBUG "-Wall -D__STDC_CONSTANT_MACROS -g")
 set(CMAKE_INCLUDE_CURRENT_DIR ON)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")

+# GCC below 6.0 doesn't support the __target__("fpu=neon") attribute, required for compiling ARM Neon code, otherwise compilation fails.
+# Must use the -mfpu=neon compiler flag instead, but only do that for processors that support Neon, otherwise strip the Neon code altogether,
+# because passing -mfpu=neon is unsafe for processors that don't support Neon.
+# Note: the probe reads /proc/cpuinfo of the machine that runs CMake (the build host), and only the Release and Debug flag sets are extended.
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND CMAKE_SYSTEM_NAME MATCHES "Linux")
+	if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.0)
+		# grep exits with 0 only when "neon" is listed as a whole word (a CPU feature flag).
+		# execute_process() runs grep directly, without an intermediate shell.
+		execute_process(COMMAND grep -w neon /proc/cpuinfo
+			OUTPUT_VARIABLE neonoutput
+			RESULT_VARIABLE neonresult
+			ERROR_QUIET)
+		if(neonresult EQUAL 0)
+			set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -mfpu=neon")
+			set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mfpu=neon")
+			set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -mfpu=neon")
+			set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mfpu=neon")
+		else()
+			# ZM_STRIP_NEON makes the sources compile the Neon functions as stubs
+			add_definitions(-DZM_STRIP_NEON=1)
+			message(STATUS "ARM Neon is not available on this processor. Neon functions will be absent")
+		endif()
+	endif()
+endif()
+
 # Modules that we need:
 include (GNUInstallDirs)
 include (CheckIncludeFile)
--- a/src/zm_camera.cpp
+++ b/src/zm_camera.cpp
@ -39,10 +39,10 @@ Camera::Camera( int p_id, SourceType p_type, int p_width, int p_height, int p_co
  Debug(2,"New camera id: %d width: %d height: %d colours: %d subpixelorder: %d capture: %d",id,width,height,colours,subpixelorder,capture);
  
  /* Because many loops are unrolled and work on 16 colours/time or 4 pixels/time, we have to meet requirements */
-  if((colours == ZM_COLOUR_GRAY8 || colours == ZM_COLOUR_RGB32) && (imagesize % 16) != 0) {
-    Fatal("Image size is not multiples of 16");
-  } else if(colours == ZM_COLOUR_RGB24 && ((imagesize % 16) != 0 || (imagesize % 12) != 0)) {
-    Fatal("Image size is not multiples of 12 and 16");
+  if((colours == ZM_COLOUR_GRAY8 || colours == ZM_COLOUR_RGB32) && (imagesize % 64) != 0) {
+    Fatal("Image size is not multiples of 64");
+  } else if(colours == ZM_COLOUR_RGB24 && ((imagesize % 64) != 0 || (imagesize % 12) != 0)) {
+    Fatal("Image size is not multiples of 12 and 64");
  }
 }

--- a/src/zm_image.cpp
+++ b/src/zm_image.cpp
@ -195,6 +195,9 @@ void Image::Initialise()
    if(config.cpu_extensions && sseversion >= 20) {
      fptr_blend = &sse2_fastblend; /* SSE2 fast blend */
      Debug(4,"Blend: Using SSE2 fast blend function");
+    } else if(config.cpu_extensions && neonversion >= 1) {
+      fptr_blend = &neon32_armv7_fastblend;  /* ARM Neon fast blend */
+      Debug(4,"Blend: Using ARM Neon fast blend function");
    } else {
      fptr_blend = &std_fastblend;  /* standard fast blend */
      Debug(4,"Blend: Using fast blend function");
@ -204,15 +207,31 @@ void Image::Initialise()
    Debug(4,"Blend: Using standard blend function");
  }
  
-  __attribute__((aligned(16))) uint8_t blend1[16] = {142,255,159,91,88,227,0,52,37,80,152,97,104,252,90,82};
-  __attribute__((aligned(16))) uint8_t blend2[16] = {129,56,136,96,119,149,94,29,96,176,1,144,230,203,111,172};
-  __attribute__((aligned(16))) uint8_t blendres[16];
-  __attribute__((aligned(16))) uint8_t blendexp[16] = {141,231,157,92,91,217,11,49,45,92,133,103,119,246,92,93}; /* Expected results for 12.5% blend */
-  
-  (*fptr_blend)(blend1,blend2,blendres,16,12.5);
-  
+  __attribute__((aligned(64))) uint8_t blend1[128] = {
+    86,58,54,63,149,62,209,34,148,46,186,176,9,236,193,254,113,146,228,220,123,164,92,98,9,72,67,156,63,118,96,167,
+    48,224,106,176,201,245,223,219,198,50,100,31,68,77,33,76,166,90,254,128,191,82,84,32,3,171,147,248,14,196,141,179,
+    79,237,121,11,132,37,194,225,45,171,169,167,56,64,193,85,147,33,97,221,94,97,90,44,191,248,65,8,17,240,167,207,
+    224,23,71,74,81,1,46,110,227,94,163,170,55,155,52,147,224,154,237,35,255,26,229,11,223,242,118,155,82,37,189,2
+  };
+  __attribute__((aligned(64))) uint8_t blend2[128] = {
+    92,188,203,118,121,231,252,218,126,88,80,72,123,16,91,131,109,0,57,56,95,204,74,8,137,94,6,69,18,146,229,194,
+    146,230,13,146,95,48,185,65,162,47,152,172,184,111,245,143,247,105,49,42,89,37,145,255,221,200,103,80,98,39,14,227,
+    227,46,46,59,248,7,83,20,157,79,36,161,237,55,77,175,232,200,38,170,198,239,89,19,82,88,130,120,203,184,141,117,
+    228,140,150,107,103,195,74,130,42,11,150,70,176,204,198,188,38,252,174,104,128,106,31,17,141,231,62,104,179,29,143,130
+  };
+  __attribute__((aligned(64))) uint8_t blendexp[128] = {
+    86,73,71,69,145,82,214,56,145,51,173,163,22,209,180,239,112,128,207,200,119,168,89,87,24,74,59,145,57,121,111,170,
+    59,224,94,172,188,221,218,200,193,49,106,47,81,81,58,84,175,91,229,117,178,76,91,58,29,174,141,227,24,177,125,184,
+    96,214,112,16,145,33,180,200,58,159,153,166,77,62,179,95,157,53,89,214,106,114,89,41,177,228,72,21,39,233,163,196,
+    224,37,80,77,83,24,49,112,204,84,161,158,69,160,69,151,201,165,229,43,239,35,205,11,213,240,111,148,93,36,183,17
+  };
+  __attribute__((aligned(64))) uint8_t blendres[128];
+
+  /* Run the blend function */
+  (*fptr_blend)(blend1,blend2,blendres,128,12.0);
+
  /* Compare results with expected results */
-  for(int i=0;i<16;i++) {
+  for(int i=0;i<128;i++) {
    if(abs(blendexp[i] - blendres[i]) > 3) {
      Panic("Blend function failed self-test: Results differ from the expected results");
    }
@ -248,6 +267,14 @@ void Image::Initialise()
      // fptr_delta8_abgr = &std_delta8_abgr;
      fptr_delta8_gray8 = &sse2_delta8_gray8;
      Debug(4,"Delta: Using SSE2 delta functions");
+    } else if(neonversion >= 1) {
+      /* ARM Neon available */
+      fptr_delta8_rgba = &neon32_armv7_delta8_rgba;
+      fptr_delta8_bgra = &neon32_armv7_delta8_bgra;
+      fptr_delta8_argb = &neon32_armv7_delta8_argb;
+      fptr_delta8_abgr = &neon32_armv7_delta8_abgr;
+      fptr_delta8_gray8 = &neon32_armv7_delta8_gray8;
+      Debug(4,"Delta: Using ARM Neon delta functions");
    } else {
      /* No suitable SSE version available */
      fptr_delta8_rgba = &std_delta8_rgba;
@ -280,6 +307,68 @@ void Image::Initialise()
  Debug(4,"Deinterlace: Using standard functions");
  
 #if defined(__i386__) && !defined(__x86_64__)
+
+  __attribute__((aligned(64))) uint8_t delta8_1[128] = {
+    221,22,234,254,8,140,15,28,166,13,203,56,92,250,79,225,19,59,241,145,253,33,87,204,97,168,229,180,3,108,205,177,
+    41,108,65,149,4,87,16,240,56,50,135,64,153,3,219,214,239,55,169,180,167,45,243,56,191,119,145,250,102,145,73,32,
+    207,213,189,167,147,83,217,30,113,51,142,125,219,97,60,5,135,195,95,133,21,197,150,82,134,93,198,97,97,49,117,24,
+    242,253,242,5,190,71,182,1,0,69,25,181,139,84,242,79,150,158,29,215,98,100,245,16,86,165,18,98,46,100,139,19
+  };
+  __attribute__((aligned(64))) uint8_t delta8_2[128] = {
+    236,22,153,161,50,141,15,130,89,251,33,5,140,201,225,194,138,76,248,89,25,26,29,93,250,251,48,157,41,126,140,152,
+    170,177,134,14,234,99,3,105,217,76,38,233,89,30,93,48,234,40,202,80,184,4,250,71,183,249,76,78,184,148,185,120,
+    137,214,238,57,50,93,29,60,99,207,40,15,43,28,177,118,60,231,90,47,198,251,250,241,212,114,249,17,95,161,216,218,
+    51,178,137,161,213,108,35,72,65,24,5,176,110,15,0,2,137,58,0,133,197,1,122,169,175,33,223,138,37,114,52,186
+  };
+  __attribute__((aligned(64))) uint8_t delta8_gray8_exp[128] = {
+    15,0,81,93,42,1,0,102,77,238,170,51,48,49,146,31,119,17,7,56,228,7,58,111,153,83,181,23,38,18,65,25,
+    129,69,69,135,230,12,13,135,161,26,97,169,64,27,126,166,5,15,33,100,17,41,7,15,8,130,69,172,82,3,112,88,
+    70,1,49,110,97,10,188,30,14,156,102,110,176,69,117,113,75,36,5,86,177,54,100,159,78,21,51,80,2,112,99,194,
+    191,75,105,156,23,37,147,71,65,45,20,5,29,69,242,77,13,100,29,82,99,99,123,153,89,132,205,40,9,14,87,167
+  };
+  /* Expected RGBA deltas for the 32 pixels held in delta8_1/delta8_2: per pixel the absolute differences */
+  /* of bytes 0, 1 and 2 are weighted 2, 5 and 1 and the sum is divided by 8; byte 3 (alpha) is ignored. */
+  /* These weights match the 0x00010502 multiplier used by neon32_armv7_delta8_rgba. */
+  __attribute__((aligned(64))) uint8_t delta8_rgba_exp[32] = {
+    13,11,189,60,41,68,112,28,84,66,68,48,14,30,91,36,24,54,113,101,41,90,39,82,107,47,46,80,69,102,130,21
+  };
+  __attribute__((aligned(64))) uint8_t delta8_gray8_res[128];
+  __attribute__((aligned(64))) uint8_t delta8_rgba_res[128];
+
+  /* Run the delta8 grayscale function (128 bytes); the output must land in the result buffer, not in the expected table */
+  (*fptr_delta8_gray8)(delta8_1,delta8_2,delta8_gray8_res,128);
+
+  /* Compare results with expected results */
+  for(int i=0;i<128;i++) {
+    if(abs(delta8_gray8_exp[i] - delta8_gray8_res[i]) > 7) {
+      Panic("Delta grayscale function failed self-test: Results differ from the expected results");
+    }
+  }
+
+  /* Run the delta8 RGBA function on 32 pixels (128 input bytes, 32 output bytes) */
+  (*fptr_delta8_rgba)(delta8_1,delta8_2,delta8_rgba_res,32);
+
+  /* Compare results with expected results */
+  for(int i=0;i<32;i++) {
+    if(abs(delta8_rgba_exp[i] - delta8_rgba_res[i]) > 7) {
+      Panic("Delta RGBA function failed self-test: Results differ from the expected results");
+    }
+  }
+  
+  /* Use SSSE3 deinterlace functions? */
+  if(config.cpu_extensions && sseversion >= 35) {
+    fptr_deinterlace_4field_rgba = &ssse3_deinterlace_4field_rgba;
+    fptr_deinterlace_4field_bgra = &ssse3_deinterlace_4field_bgra;
+    fptr_deinterlace_4field_argb = &ssse3_deinterlace_4field_argb;
+    fptr_deinterlace_4field_abgr = &ssse3_deinterlace_4field_abgr;
+    fptr_deinterlace_4field_gray8 = &ssse3_deinterlace_4field_gray8;
+    Debug(4,"Deinterlace: Using SSSE3 delta functions");
+  } else {
+    fptr_deinterlace_4field_rgba = &std_deinterlace_4field_rgba;
+    fptr_deinterlace_4field_bgra = &std_deinterlace_4field_bgra;
+    fptr_deinterlace_4field_argb = &std_deinterlace_4field_argb;
+    fptr_deinterlace_4field_abgr = &std_deinterlace_4field_abgr;
+    fptr_deinterlace_4field_gray8 = &std_deinterlace_4field_gray8;
+    Debug(4,"Deinterlace: Using standard delta functions");
+  }
+  
  /* Use SSE2 aligned memory copy? */
  if(config.cpu_extensions && sseversion >= 20) {
    fptr_imgbufcpy = &sse2_aligned_memcpy;
@ -3297,6 +3386,91 @@ __attribute__((noinline)) void std_fastblend(const uint8_t* col1, const uint8_t*
  }
 }

+/*
+ * FastBlend Neon for AArch32
+ *
+ * Blends col2 into col1 byte by byte and stores the outcome in result:
+ *   result = col1 + ((col2 >> n) - (col1 >> n))
+ * using rounding right shifts and wrap-around 8-bit arithmetic. blendpercent is snapped to the
+ * closest supported step (50, 25, 12.5, 6.25, 3.125 or 1.5625 percent), which selects n = 1..6.
+ *
+ * count is the number of bytes. Every iteration handles 64 bytes and the loop only ends when
+ * count reaches exactly zero, so count must be a non-zero multiple of 64.
+ *
+ * The selected shift is cached in function-local statics and only recomputed when blendpercent
+ * differs from the previous call.
+ * NOTE(review): the cache starts at 0.0, so a first-ever call with blendpercent == 0.0 skips the
+ * selection and runs with a shift of 0.
+ *
+ * When built for a non-ARM target, or with ZM_STRIP_NEON defined, the function only calls Panic().
+ */
+#if (defined(__arm__) && !defined(ZM_STRIP_NEON))
+__attribute__((noinline,__target__("fpu=neon")))
+#endif
+void neon32_armv7_fastblend(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count, double blendpercent) {
+#if (defined(__arm__) && !defined(ZM_STRIP_NEON))
+  static int8_t divider = 0;
+  static double current_blendpercent = 0.0;
+
+  if(current_blendpercent != blendpercent) {
+    /* Attempt to match the blending percent to one of the possible values */
+    if(blendpercent < 2.34375) {
+      // 1.5625% blending
+      divider = 6;
+    } else if(blendpercent >= 2.34375 && blendpercent < 4.6875) {
+      // 3.125% blending
+      divider = 5;
+    } else if(blendpercent >= 4.6875 && blendpercent < 9.375) {
+      // 6.25% blending
+      divider = 4;
+    } else if(blendpercent >= 9.375 && blendpercent < 18.75) {
+      // 12.5% blending
+      divider = 3;
+    } else if(blendpercent >= 18.75 && blendpercent < 37.5) {
+      // 25% blending
+      divider = 2;
+    } else if(blendpercent >= 37.5) {
+      // 50% blending
+      divider = 1;
+    }
+    // We only have instruction to shift left by a variable, going negative shifts right :)
+    divider *= -1;
+    current_blendpercent = blendpercent;
+  }
+
+  /* Q0(D0,D1)    = col1+0 */
+  /* Q1(D2,D3)    = col1+16 */
+  /* Q2(D4,D5)    = col1+32 */
+  /* Q3(D6,D7)    = col1+48 */
+  /* Q4(D8,D9)    = col2+0 */
+  /* Q5(D10,D11)  = col2+16 */
+  /* Q6(D12,D13)  = col2+32 */
+  /* Q7(D14,D15)  = col2+48 */
+  /* Q8(D16,D17)  = col1tmp+0 */
+  /* Q9(D18,D19)  = col1tmp+16 */
+  /* Q10(D20,D21) = col1tmp+32 */
+  /* Q11(D22,D23) = col1tmp+48 */
+  /* Q12(D24,D25) = divider */
+
+  /* The pointers and the counter are advanced by the assembly itself (vldm/vstm with writeback, subs), */
+  /* so they are declared as read-write "+r" operands; operand numbers %0-%4 stay the same. */
+  __asm__ __volatile__ (
+  "mov r12, %4\n\t"
+  "vdup.8 q12, r12\n\t"
+  "neon32_armv7_fastblend_iter:\n\t"
+  "pld [%0,#256]\n\t"
+  "pld [%1,#256]\n\t"
+  "vldm %0!, {q0,q1,q2,q3}\n\t"
+  "vldm %1!, {q4,q5,q6,q7}\n\t"
+  "vrshl.u8 q8, q0, q12\n\t"
+  "vrshl.u8 q9, q1, q12\n\t"
+  "vrshl.u8 q10, q2, q12\n\t"
+  "vrshl.u8 q11, q3, q12\n\t"
+  "vrshl.u8 q4, q4, q12\n\t"
+  "vrshl.u8 q5, q5, q12\n\t"
+  "vrshl.u8 q6, q6, q12\n\t"
+  "vrshl.u8 q7, q7, q12\n\t"
+  "vsub.i8 q4, q4, q8\n\t"
+  "vsub.i8 q5, q5, q9\n\t"
+  "vsub.i8 q6, q6, q10\n\t"
+  "vsub.i8 q7, q7, q11\n\t"
+  "vadd.i8 q4, q4, q0\n\t"
+  "vadd.i8 q5, q5, q1\n\t"
+  "vadd.i8 q6, q6, q2\n\t"
+  "vadd.i8 q7, q7, q3\n\t"
+  "vstm %2!, {q4,q5,q6,q7}\n\t"
+  "subs %3, %3, #64\n\t"
+  "bne neon32_armv7_fastblend_iter\n\t"
+  : "+r" (col1), "+r" (col2), "+r" (result), "+r" (count)
+  : "r" (divider)
+  : "%r12", "%q0", "%q1", "%q2", "%q3", "%q4", "%q5", "%q6", "%q7", "%q8", "%q9", "%q10", "%q11", "%q12", "cc", "memory"
+  );
+#else
+  Panic("Neon function called on a non-ARM platform or Neon code is absent");
+#endif
+}
+
 __attribute__((noinline)) void std_blend(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count, double blendpercent) {
  double divide = blendpercent / 100.0;
  double opacity = 1.0 - divide;
@ -3519,6 +3693,122 @@ __attribute__((noinline)) void std_delta8_abgr(const uint8_t* col1, const uint8_
  }
 }

+/*
+ * Grayscale Neon for AArch32
+ *
+ * Stores the absolute difference of every byte of col1 and col2 in result.
+ *
+ * count is the number of bytes. Every iteration handles 64 bytes and the loop only ends when
+ * count reaches exactly zero, so count must be a non-zero multiple of 64.
+ *
+ * When built for a non-ARM target, or with ZM_STRIP_NEON defined, the function only calls Panic().
+ */
+#if (defined(__arm__) && !defined(ZM_STRIP_NEON))
+__attribute__((noinline,__target__("fpu=neon")))
+#endif
+void neon32_armv7_delta8_gray8(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count) {
+#if (defined(__arm__) && !defined(ZM_STRIP_NEON))
+
+  /* Q0(D0,D1)   = col1+0 */
+  /* Q1(D2,D3)   = col1+16 */
+  /* Q2(D4,D5)   = col1+32 */
+  /* Q3(D6,D7)   = col1+48 */
+  /* Q4(D8,D9)   = col2+0 */
+  /* Q5(D10,D11) = col2+16 */
+  /* Q6(D12,D13) = col2+32 */
+  /* Q7(D14,D15) = col2+48 */
+
+  /* The pointers and the counter are advanced by the assembly itself (vldm/vstm with writeback, subs), */
+  /* so they are declared as read-write "+r" operands; operand numbers %0-%3 stay the same. */
+  __asm__ __volatile__ (
+  "neon32_armv7_delta8_gray8_iter:\n\t"
+  "pld [%0,#256]\n\t"
+  "pld [%1,#256]\n\t"
+  "vldm %0!, {q0,q1,q2,q3}\n\t"
+  "vldm %1!, {q4,q5,q6,q7}\n\t"
+  "vabd.u8 q0, q0, q4\n\t"
+  "vabd.u8 q1, q1, q5\n\t"
+  "vabd.u8 q2, q2, q6\n\t"
+  "vabd.u8 q3, q3, q7\n\t"
+  "vstm %2!, {q0,q1,q2,q3}\n\t"
+  "subs %3, %3, #64\n\t"
+  "bne neon32_armv7_delta8_gray8_iter\n\t"
+  : "+r" (col1), "+r" (col2), "+r" (result), "+r" (count)
+  :
+  : "%q0", "%q1", "%q2", "%q3", "%q4", "%q5", "%q6", "%q7", "cc", "memory"
+  );
+#else
+  Panic("Neon function called on a non-ARM platform or Neon code is absent");
+#endif
+}
+
+/*
+ * RGB32 Neon for AArch32
+ *
+ * Produces one 8-bit delta per 32-bit pixel: the absolute per-byte differences of col1 and col2
+ * are divided by 8 (rounding shift), multiplied byte-wise by the four 8-bit weights packed in
+ * multiplier, and the four products of each pixel are summed into a single result byte.
+ *
+ * count is the number of pixels. Every iteration reads 16 pixels (64 bytes) from each input,
+ * stores 16 result bytes, and the loop only ends when count reaches exactly zero, so count
+ * must be a non-zero multiple of 16.
+ *
+ * When built for a non-ARM target, or with ZM_STRIP_NEON defined, the function only calls Panic().
+ */
+#if (defined(__arm__) && !defined(ZM_STRIP_NEON))
+__attribute__((noinline,__target__("fpu=neon")))
+#endif
+void neon32_armv7_delta8_rgb32(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count, uint32_t multiplier) {
+#if (defined(__arm__) && !defined(ZM_STRIP_NEON))
+
+  /* Q0(D0,D1)   = col1+0 */
+  /* Q1(D2,D3)   = col1+16 */
+  /* Q2(D4,D5)   = col1+32 */
+  /* Q3(D6,D7)   = col1+48 */
+  /* Q4(D8,D9)   = col2+0 */
+  /* Q5(D10,D11) = col2+16 */
+  /* Q6(D12,D13) = col2+32 */
+  /* Q7(D14,D15) = col2+48 */
+  /* Q8(D16,D17) = multiplier */
+
+  /* The pointers and the counter are advanced by the assembly itself (vldm/vst4 with writeback, subs), */
+  /* so they are declared as read-write "+r" operands; operand numbers %0-%4 stay the same. */
+  __asm__ __volatile__ (
+  "mov r12, %4\n\t"
+  "vdup.32 q8, r12\n\t"
+  "neon32_armv7_delta8_rgb32_iter:\n\t"
+  "pld [%0,#256]\n\t"
+  "pld [%1,#256]\n\t"
+  "vldm %0!, {q0,q1,q2,q3}\n\t"
+  "vldm %1!, {q4,q5,q6,q7}\n\t"
+  "vabd.u8 q0, q0, q4\n\t"
+  "vabd.u8 q1, q1, q5\n\t"
+  "vabd.u8 q2, q2, q6\n\t"
+  "vabd.u8 q3, q3, q7\n\t"
+  "vrshr.u8 q0, q0, #3\n\t"
+  "vrshr.u8 q1, q1, #3\n\t"
+  "vrshr.u8 q2, q2, #3\n\t"
+  "vrshr.u8 q3, q3, #3\n\t"
+  "vmul.i8 q0, q0, q8\n\t"
+  "vmul.i8 q1, q1, q8\n\t"
+  "vmul.i8 q2, q2, q8\n\t"
+  "vmul.i8 q3, q3, q8\n\t"
+  "vpadd.i8 d0, d0, d1\n\t"
+  "vpadd.i8 d2, d2, d3\n\t"
+  "vpadd.i8 d4, d4, d5\n\t"
+  "vpadd.i8 d6, d6, d7\n\t"
+  "vpadd.i8 d0, d0, d0\n\t"
+  "vpadd.i8 d1, d2, d2\n\t"
+  "vpadd.i8 d2, d4, d4\n\t"
+  "vpadd.i8 d3, d6, d6\n\t"
+  "vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [%2]!\n\t"
+  "subs %3, %3, #16\n\t"
+  "bne neon32_armv7_delta8_rgb32_iter\n\t"
+  : "+r" (col1), "+r" (col2), "+r" (result), "+r" (count)
+  : "r" (multiplier)
+  : "%r12", "%q0", "%q1", "%q2", "%q3", "%q4", "%q5", "%q6", "%q7", "%q8", "cc", "memory"
+  );
+#else
+  Panic("Neon function called on a non-ARM platform or Neon code is absent");
+#endif
+}
+
+/* RGB32: RGBA Neon for AArch32 */
+/* Forwards to the generic RGB32 routine; the multiplier packs one 8-bit weight per pixel byte: */
+/* 2 for byte 0, 5 for byte 1, 1 for byte 2 and 0 for byte 3 (lowest byte of the constant first). */
+void neon32_armv7_delta8_rgba(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count) {
+  const uint32_t rgba_weights = 0x00010502;
+  neon32_armv7_delta8_rgb32(col1, col2, result, count, rgba_weights);
+}
+
+/* RGB32: BGRA Neon for AArch32 */
+/* Forwards to the generic RGB32 routine; the multiplier packs one 8-bit weight per pixel byte: */
+/* 1 for byte 0, 5 for byte 1, 2 for byte 2 and 0 for byte 3 (lowest byte of the constant first). */
+void neon32_armv7_delta8_bgra(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count) {
+  const uint32_t bgra_weights = 0x00020501;
+  neon32_armv7_delta8_rgb32(col1, col2, result, count, bgra_weights);
+}
+
+/* RGB32: ARGB Neon for AArch32 */
+/* Forwards to the generic RGB32 routine; the multiplier packs one 8-bit weight per pixel byte: */
+/* 0 for byte 0, 2 for byte 1, 5 for byte 2 and 1 for byte 3 (lowest byte of the constant first). */
+void neon32_armv7_delta8_argb(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count) {
+  const uint32_t argb_weights = 0x01050200;
+  neon32_armv7_delta8_rgb32(col1, col2, result, count, argb_weights);
+}
+
+/* RGB32: ABGR Neon for AArch32 */
+/* Forwards to the generic RGB32 routine; the multiplier packs one 8-bit weight per pixel byte: */
+/* 0 for byte 0, 1 for byte 1, 5 for byte 2 and 2 for byte 3 (lowest byte of the constant first). */
+void neon32_armv7_delta8_abgr(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count) {
+  const uint32_t abgr_weights = 0x02050100;
+  neon32_armv7_delta8_rgb32(col1, col2, result, count, abgr_weights);
+}
+
 /* Grayscale SSE2 */
 #if defined(__i386__) || defined(__x86_64__)
 __attribute__((noinline,__target__("sse2")))
--- a/src/zm_image.h
+++ b/src/zm_image.h
@ -54,7 +54,7 @@ extern imgbufcpy_fptr_t fptr_imgbufcpy;

 /* Should be called from Image class functions */
 inline static uint8_t* AllocBuffer(size_t p_bufsize) {
-	uint8_t* buffer = (uint8_t*)zm_mallocaligned(16,p_bufsize);
+	uint8_t* buffer = (uint8_t*)zm_mallocaligned(64,p_bufsize);
 	if(buffer == NULL)
 		Fatal("Memory allocation failed: %s",strerror(errno));
 	
@ -264,6 +264,7 @@ public:
 /* Blend functions */
 void sse2_fastblend(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count, double blendpercent);
 void std_fastblend(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count, double blendpercent);
+void neon32_armv7_fastblend(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count, double blendpercent);
 void std_blend(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count, double blendpercent);

 /* Delta functions */
@ -274,6 +275,11 @@ void std_delta8_rgba(const uint8_t* col1, const uint8_t* col2, uint8_t* result,
 void std_delta8_bgra(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
 void std_delta8_argb(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
 void std_delta8_abgr(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
+void neon32_armv7_delta8_gray8(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
+void neon32_armv7_delta8_rgba(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
+void neon32_armv7_delta8_bgra(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
+void neon32_armv7_delta8_argb(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
+void neon32_armv7_delta8_abgr(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
 void sse2_delta8_gray8(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
 void sse2_delta8_rgba(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
 void sse2_delta8_bgra(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
--- a/src/zm_libvlc_camera.cpp
+++ b/src/zm_libvlc_camera.cpp
@ -182,8 +182,8 @@ int LibvlcCamera::PrimeCapture()

  mLibvlcData.bufferSize = width * height * mBpp;
  // Libvlc wants 32 byte alignment for images (should in theory do this for all image lines)
-  mLibvlcData.buffer = (uint8_t*)zm_mallocaligned(32, mLibvlcData.bufferSize);
-  mLibvlcData.prevBuffer = (uint8_t*)zm_mallocaligned(32, mLibvlcData.bufferSize);
+  mLibvlcData.buffer = (uint8_t*)zm_mallocaligned(64, mLibvlcData.bufferSize);
+  mLibvlcData.prevBuffer = (uint8_t*)zm_mallocaligned(64, mLibvlcData.bufferSize);
  
  mLibvlcData.newImage.setValueImmediate(false);

--- a/src/zm_monitor.cpp
+++ b/src/zm_monitor.cpp
@ -393,7 +393,7 @@ Monitor::Monitor(
       + sizeof(TriggerData)
       + (image_buffer_count*sizeof(struct timeval))
       + (image_buffer_count*camera->ImageSize())
-       + 64; /* Padding used to permit aligning the images buffer to 16 byte boundary */
+       + 64; /* Padding used to permit aligning the images buffer to 64 byte boundary */

  Debug( 1, "mem.size=%d", mem_size );
  mem_ptr = NULL;
@ -569,10 +569,10 @@ bool Monitor::connect() {
  struct timeval *shared_timestamps = (struct timeval *)((char *)trigger_data + sizeof(TriggerData));
  unsigned char *shared_images = (unsigned char *)((char *)shared_timestamps + (image_buffer_count*sizeof(struct timeval)));
  
-  if(((unsigned long)shared_images % 16) != 0) {
-    /* Align images buffer to nearest 16 byte boundary */
-    Debug(3,"Aligning shared memory images to the next 16 byte boundary");
-    shared_images = (uint8_t*)((unsigned long)shared_images + (16 - ((unsigned long)shared_images % 16)));
+  if(((unsigned long)shared_images % 64) != 0) {
+    /* Align images buffer to nearest 64 byte boundary */
+    Debug(3,"Aligning shared memory images to the next 64 byte boundary");
+    shared_images = (uint8_t*)((unsigned long)shared_images + (64 - ((unsigned long)shared_images % 64)));
  }
  image_buffer = new Snapshot[image_buffer_count];
  for ( int i = 0; i < image_buffer_count; i++ )
--- a/src/zm_utils.cpp
+++ b/src/zm_utils.cpp
@ -24,12 +24,16 @@
 #include <string.h>
 #include <stdio.h>
 #include <stdarg.h>
+#if defined(__arm__)
+#include <sys/auxv.h>
+#endif

 #ifdef HAVE_CURL_CURL_H
 #include <curl/curl.h>
 #endif

 unsigned int sseversion = 0;
+unsigned int neonversion = 0;

 std::string trimSet(std::string str, std::string trimset) {
  // Trim Both leading and trailing sets
@ -238,30 +242,59 @@ int pairsplit(const char* string, const char delim, std::string& name, std::stri
  return 0;
 }

-/* Sets sse_version  */
-void ssedetect() {
+/* Detect special hardware features, such as SIMD instruction sets */
+void hwcaps_detect() {
+  neonversion = 0;
+  sseversion = 0;
 #if (defined(__i386__) || defined(__x86_64__))
  /* x86 or x86-64 processor */
-  uint32_t r_edx, r_ecx;
-  
+  uint32_t r_edx, r_ecx, r_ebx;
+
+#ifdef __x86_64__
  __asm__ __volatile__(
-#if defined(__i386__)
-    "pushl %%ebx;\n\t"
-#endif
+  "push %%rbx\n\t"
+  "mov $0x0,%%ecx\n\t"
+  "mov $0x7,%%eax\n\t"
+  "cpuid\n\t"
+  "push %%rbx\n\t"
  "mov $0x1,%%eax\n\t"
  "cpuid\n\t"
-#if defined(__i386__)
-    "popl %%ebx;\n\t"
-#endif
-  : "=d" (r_edx), "=c" (r_ecx)
+  "pop %%rax\n\t"
+  "pop %%rbx\n\t"
+  : "=d" (r_edx), "=c" (r_ecx), "=a" (r_ebx)
+  :
  :
-  : "%eax"
-#if !defined(__i386__)
-       , "%ebx"
-#endif
  );
-  
-  if (r_ecx & 0x00000200) {
+#else
+  __asm__ __volatile__(
+  "push %%ebx\n\t"
+  "mov $0x0,%%ecx\n\t"
+  "mov $0x7,%%eax\n\t"
+  "cpuid\n\t"
+  "push %%ebx\n\t"
+  "mov $0x1,%%eax\n\t"
+  "cpuid\n\t"
+  "pop %%eax\n\t"
+  "pop %%ebx\n\t"
+  : "=d" (r_edx), "=c" (r_ecx), "=a" (r_ebx)
+  :
+  :
+  );
+#endif
+
+  if (r_ebx & 0x00000020) {
+    sseversion = 52; /* AVX2 */
+    Debug(1,"Detected a x86\\x86-64 processor with AVX2");
+  } else if (r_ecx & 0x10000000) {
+    sseversion = 51; /* AVX */
+    Debug(1,"Detected a x86\\x86-64 processor with AVX");
+  } else if (r_ecx & 0x00100000) {
+    sseversion = 42; /* SSE4.2 */
+    Debug(1,"Detected a x86\\x86-64 processor with SSE4.2");
+  } else if (r_ecx & 0x00080000) {
+    sseversion = 41; /* SSE4.1 */
+    Debug(1,"Detected a x86\\x86-64 processor with SSE4.1");
+  } else if (r_ecx & 0x00000200) {
    sseversion = 35; /* SSSE3 */
    Debug(1,"Detected a x86\\x86-64 processor with SSSE3");
  } else if (r_ecx & 0x00000001) {
@ -276,12 +309,20 @@ void ssedetect() {
  } else {
    sseversion = 0;
    Debug(1,"Detected a x86\\x86-64 processor");
+  } 
+#elif defined(__arm__)
+  // ARM processor
+  // To see if it supports NEON, we need to get that information from the kernel
+  unsigned long auxval = getauxval(AT_HWCAP);
+  if (auxval & HWCAP_ARM_NEON) {
+    Debug(1,"Detected ARM processor with Neon");
+    neonversion = 1;
+  } else {
+    Debug(1,"Detected ARM processor");
  }
-  
 #else
-  /* Non x86 or x86-64 processor, SSE2 is not available */
-  Debug(1,"Detected a non x86\\x86-64 processor");
-  sseversion = 0;
+  // Unknown processor
+  Debug(1,"Detected unknown processor architecture");
 #endif
 }

--- a/src/zm_utils.h
+++ b/src/zm_utils.h
@ -54,11 +54,12 @@ inline int min( int a, int b )
  return( a<=b?a:b );
 }

-void ssedetect();
 void* sse2_aligned_memcpy(void* dest, const void* src, size_t bytes);
 void timespec_diff(struct timespec *start, struct timespec *end, struct timespec *diff);

+void hwcaps_detect();
 extern unsigned int sseversion;
+extern unsigned int neonversion;

 std::string UriDecode( const std::string &encoded );

--- a/src/zma.cpp
+++ b/src/zma.cpp
@ -133,7 +133,7 @@ int main( int argc, char *argv[] )

  logInit( log_id_string );
  
-  ssedetect();
+  hwcaps_detect();

  Monitor *monitor = Monitor::Load( id, true, Monitor::ANALYSIS );

--- a/src/zmc.cpp
+++ b/src/zmc.cpp
@ -206,7 +206,7 @@ int main( int argc, char *argv[] )

  logInit( log_id_string );
  
-  ssedetect();
+  hwcaps_detect();

  Monitor **monitors = 0;
  int n_monitors = 0;
--- a/src/zms.cpp
+++ b/src/zms.cpp
@ -91,7 +91,7 @@ int main( int argc, const char *argv[] )

  logInit( "zms" );
  
-  ssedetect();
+  hwcaps_detect();

  zmSetDefaultTermHandler();
  zmSetDefaultDieHandler();