Merge branch 'mastertheknife-armv7_neon'
This commit is contained in:
commit
021c136640
|
@ -66,6 +66,24 @@ set(CMAKE_CXX_FLAGS_DEBUG "-Wall -D__STDC_CONSTANT_MACROS -g")
|
|||
set(CMAKE_INCLUDE_CURRENT_DIR ON)
|
||||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
|
||||
|
||||
# GCC below 6.0 doesn't support __target__("fpu=neon") attribute, required for compiling ARM Neon code, otherwise compilation fails.
|
||||
# Must use -mfpu=neon compiler flag instead, but only do that for processors that support neon, otherwise strip the neon code altogether,
|
||||
# because passing -mfpu=neon is unsafe on processors that don't support neon
|
||||
# Workaround for ARM Linux builds using GCC older than 6.0: enable Neon through
# a global -mfpu=neon flag when the CPU advertises it, otherwise compile the
# Neon code out via ZM_STRIP_NEON.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND CMAKE_SYSTEM_NAME MATCHES "Linux")
  if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.0)
    # Look for the "neon" feature flag of the machine running CMake.
    # execute_process() replaces the deprecated EXEC_PROGRAM() and passes each
    # argument verbatim (no shell re-quoting). "-w" matches neon as a whole
    # word, so it is found even when it is the last entry on the Features line.
    # NOTE(review): this inspects the build machine, which is only meaningful
    # for native (non-cross) builds.
    execute_process(
      COMMAND grep -w neon /proc/cpuinfo
      OUTPUT_VARIABLE neonoutput
      RESULT_VARIABLE neonresult
      ERROR_QUIET)
    if(neonresult EQUAL 0)
      # grep found a match: Neon is available, enable it for every build type
      set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -mfpu=neon")
      set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mfpu=neon")
      set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -mfpu=neon")
      set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mfpu=neon")
    else()
      # No match (or grep could not run): leave the Neon code out entirely
      add_definitions(-DZM_STRIP_NEON=1)
      message(STATUS "ARM Neon is not available on this processor. Neon functions will be absent")
    endif()
  endif()
endif()
|
||||
|
||||
# Modules that we need:
|
||||
include (GNUInstallDirs)
|
||||
include (CheckIncludeFile)
|
||||
|
|
|
@ -39,10 +39,10 @@ Camera::Camera( int p_id, SourceType p_type, int p_width, int p_height, int p_co
|
|||
Debug(2,"New camera id: %d width: %d height: %d colours: %d subpixelorder: %d capture: %d",id,width,height,colours,subpixelorder,capture);
|
||||
|
||||
/* Because many loops are unrolled and work on 16 colours/time or 4 pixels/time, we have to meet requirements */
|
||||
if((colours == ZM_COLOUR_GRAY8 || colours == ZM_COLOUR_RGB32) && (imagesize % 16) != 0) {
|
||||
Fatal("Image size is not multiples of 16");
|
||||
} else if(colours == ZM_COLOUR_RGB24 && ((imagesize % 16) != 0 || (imagesize % 12) != 0)) {
|
||||
Fatal("Image size is not multiples of 12 and 16");
|
||||
if((colours == ZM_COLOUR_GRAY8 || colours == ZM_COLOUR_RGB32) && (imagesize % 64) != 0) {
|
||||
Fatal("Image size is not multiples of 64");
|
||||
} else if(colours == ZM_COLOUR_RGB24 && ((imagesize % 64) != 0 || (imagesize % 12) != 0)) {
|
||||
Fatal("Image size is not multiples of 12 and 64");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
306
src/zm_image.cpp
306
src/zm_image.cpp
|
@ -195,6 +195,9 @@ void Image::Initialise()
|
|||
if(config.cpu_extensions && sseversion >= 20) {
|
||||
fptr_blend = &sse2_fastblend; /* SSE2 fast blend */
|
||||
Debug(4,"Blend: Using SSE2 fast blend function");
|
||||
} else if(config.cpu_extensions && neonversion >= 1) {
|
||||
fptr_blend = &neon32_armv7_fastblend; /* ARM Neon fast blend */
|
||||
Debug(4,"Blend: Using ARM Neon fast blend function");
|
||||
} else {
|
||||
fptr_blend = &std_fastblend; /* standard fast blend */
|
||||
Debug(4,"Blend: Using fast blend function");
|
||||
|
@ -204,15 +207,31 @@ void Image::Initialise()
|
|||
Debug(4,"Blend: Using standard blend function");
|
||||
}
|
||||
|
||||
__attribute__((aligned(16))) uint8_t blend1[16] = {142,255,159,91,88,227,0,52,37,80,152,97,104,252,90,82};
|
||||
__attribute__((aligned(16))) uint8_t blend2[16] = {129,56,136,96,119,149,94,29,96,176,1,144,230,203,111,172};
|
||||
__attribute__((aligned(16))) uint8_t blendres[16];
|
||||
__attribute__((aligned(16))) uint8_t blendexp[16] = {141,231,157,92,91,217,11,49,45,92,133,103,119,246,92,93}; /* Expected results for 12.5% blend */
|
||||
|
||||
(*fptr_blend)(blend1,blend2,blendres,16,12.5);
|
||||
|
||||
__attribute__((aligned(64))) uint8_t blend1[128] = {
|
||||
86,58,54,63,149,62,209,34,148,46,186,176,9,236,193,254,113,146,228,220,123,164,92,98,9,72,67,156,63,118,96,167,
|
||||
48,224,106,176,201,245,223,219,198,50,100,31,68,77,33,76,166,90,254,128,191,82,84,32,3,171,147,248,14,196,141,179,
|
||||
79,237,121,11,132,37,194,225,45,171,169,167,56,64,193,85,147,33,97,221,94,97,90,44,191,248,65,8,17,240,167,207,
|
||||
224,23,71,74,81,1,46,110,227,94,163,170,55,155,52,147,224,154,237,35,255,26,229,11,223,242,118,155,82,37,189,2
|
||||
};
|
||||
__attribute__((aligned(64))) uint8_t blend2[128] = {
|
||||
92,188,203,118,121,231,252,218,126,88,80,72,123,16,91,131,109,0,57,56,95,204,74,8,137,94,6,69,18,146,229,194,
|
||||
146,230,13,146,95,48,185,65,162,47,152,172,184,111,245,143,247,105,49,42,89,37,145,255,221,200,103,80,98,39,14,227,
|
||||
227,46,46,59,248,7,83,20,157,79,36,161,237,55,77,175,232,200,38,170,198,239,89,19,82,88,130,120,203,184,141,117,
|
||||
228,140,150,107,103,195,74,130,42,11,150,70,176,204,198,188,38,252,174,104,128,106,31,17,141,231,62,104,179,29,143,130
|
||||
};
|
||||
__attribute__((aligned(64))) uint8_t blendexp[128] = {
|
||||
86,73,71,69,145,82,214,56,145,51,173,163,22,209,180,239,112,128,207,200,119,168,89,87,24,74,59,145,57,121,111,170,
|
||||
59,224,94,172,188,221,218,200,193,49,106,47,81,81,58,84,175,91,229,117,178,76,91,58,29,174,141,227,24,177,125,184,
|
||||
96,214,112,16,145,33,180,200,58,159,153,166,77,62,179,95,157,53,89,214,106,114,89,41,177,228,72,21,39,233,163,196,
|
||||
224,37,80,77,83,24,49,112,204,84,161,158,69,160,69,151,201,165,229,43,239,35,205,11,213,240,111,148,93,36,183,17
|
||||
};
|
||||
__attribute__((aligned(64))) uint8_t blendres[128];
|
||||
|
||||
/* Run the blend function */
|
||||
(*fptr_blend)(blend1,blend2,blendres,128,12.0);
|
||||
|
||||
/* Compare results with expected results */
|
||||
for(int i=0;i<16;i++) {
|
||||
for(int i=0;i<128;i++) {
|
||||
if(abs(blendexp[i] - blendres[i]) > 3) {
|
||||
Panic("Blend function failed self-test: Results differ from the expected results");
|
||||
}
|
||||
|
@ -248,6 +267,14 @@ void Image::Initialise()
|
|||
// fptr_delta8_abgr = &std_delta8_abgr;
|
||||
fptr_delta8_gray8 = &sse2_delta8_gray8;
|
||||
Debug(4,"Delta: Using SSE2 delta functions");
|
||||
} else if(neonversion >= 1) {
|
||||
/* ARM Neon available */
|
||||
fptr_delta8_rgba = &neon32_armv7_delta8_rgba;
|
||||
fptr_delta8_bgra = &neon32_armv7_delta8_bgra;
|
||||
fptr_delta8_argb = &neon32_armv7_delta8_argb;
|
||||
fptr_delta8_abgr = &neon32_armv7_delta8_abgr;
|
||||
fptr_delta8_gray8 = &neon32_armv7_delta8_gray8;
|
||||
Debug(4,"Delta: Using ARM Neon delta functions");
|
||||
} else {
|
||||
/* No suitable SSE version available */
|
||||
fptr_delta8_rgba = &std_delta8_rgba;
|
||||
|
@ -280,6 +307,68 @@ void Image::Initialise()
|
|||
Debug(4,"Deinterlace: Using standard functions");
|
||||
|
||||
#if defined(__i386__) && !defined(__x86_64__)
|
||||
|
||||
__attribute__((aligned(64))) uint8_t delta8_1[128] = {
|
||||
221,22,234,254,8,140,15,28,166,13,203,56,92,250,79,225,19,59,241,145,253,33,87,204,97,168,229,180,3,108,205,177,
|
||||
41,108,65,149,4,87,16,240,56,50,135,64,153,3,219,214,239,55,169,180,167,45,243,56,191,119,145,250,102,145,73,32,
|
||||
207,213,189,167,147,83,217,30,113,51,142,125,219,97,60,5,135,195,95,133,21,197,150,82,134,93,198,97,97,49,117,24,
|
||||
242,253,242,5,190,71,182,1,0,69,25,181,139,84,242,79,150,158,29,215,98,100,245,16,86,165,18,98,46,100,139,19
|
||||
};
|
||||
__attribute__((aligned(64))) uint8_t delta8_2[128] = {
|
||||
236,22,153,161,50,141,15,130,89,251,33,5,140,201,225,194,138,76,248,89,25,26,29,93,250,251,48,157,41,126,140,152,
|
||||
170,177,134,14,234,99,3,105,217,76,38,233,89,30,93,48,234,40,202,80,184,4,250,71,183,249,76,78,184,148,185,120,
|
||||
137,214,238,57,50,93,29,60,99,207,40,15,43,28,177,118,60,231,90,47,198,251,250,241,212,114,249,17,95,161,216,218,
|
||||
51,178,137,161,213,108,35,72,65,24,5,176,110,15,0,2,137,58,0,133,197,1,122,169,175,33,223,138,37,114,52,186
|
||||
};
|
||||
__attribute__((aligned(64))) uint8_t delta8_gray8_exp[128] = {
|
||||
15,0,81,93,42,1,0,102,77,238,170,51,48,49,146,31,119,17,7,56,228,7,58,111,153,83,181,23,38,18,65,25,
|
||||
129,69,69,135,230,12,13,135,161,26,97,169,64,27,126,166,5,15,33,100,17,41,7,15,8,130,69,172,82,3,112,88,
|
||||
70,1,49,110,97,10,188,30,14,156,102,110,176,69,117,113,75,36,5,86,177,54,100,159,78,21,51,80,2,112,99,194,
|
||||
191,75,105,156,23,37,147,71,65,45,20,5,29,69,242,77,13,100,29,82,99,99,123,153,89,132,205,40,9,14,87,167
|
||||
};
|
||||
__attribute__((aligned(64))) uint8_t delta8_rgba_exp[32] = {
|
||||
73,25,148,105,20,64,129,49,85,43,106,123,47,13,102,92,58,126,110,110,29,109,54,124,114,114,19,179,51,127,154,97
|
||||
};
|
||||
__attribute__((aligned(64))) uint8_t delta8_gray8_res[128];
|
||||
__attribute__((aligned(64))) uint8_t delta8_rgba_res[128];
|
||||
|
||||
/* Run the delta8 grayscale function */
|
||||
(*fptr_delta8_gray8)(delta8_1,delta8_2,delta8_gray8_exp,128);
|
||||
|
||||
/* Compare results with expected results */
|
||||
for(int i=0;i<128;i++) {
|
||||
if(abs(delta8_gray8_exp[i] - delta8_gray8_res[i]) > 7) {
|
||||
Panic("Delta grayscale function failed self-test: Results differ from the expected results");
|
||||
}
|
||||
}
|
||||
|
||||
/* Run the delta8 RGBA function */
|
||||
(*fptr_delta8_rgba)(delta8_1,delta8_2,delta8_rgba_exp,32);
|
||||
|
||||
/* Compare results with expected results */
|
||||
for(int i=0;i<32;i++) {
|
||||
if(abs(delta8_rgba_exp[i] - delta8_rgba_res[i]) > 7) {
|
||||
Panic("Delta RGBA function failed self-test: Results differ from the expected results");
|
||||
}
|
||||
}
|
||||
|
||||
/* Use SSSE3 deinterlace functions? */
|
||||
if(config.cpu_extensions && sseversion >= 35) {
|
||||
fptr_deinterlace_4field_rgba = &ssse3_deinterlace_4field_rgba;
|
||||
fptr_deinterlace_4field_bgra = &ssse3_deinterlace_4field_bgra;
|
||||
fptr_deinterlace_4field_argb = &ssse3_deinterlace_4field_argb;
|
||||
fptr_deinterlace_4field_abgr = &ssse3_deinterlace_4field_abgr;
|
||||
fptr_deinterlace_4field_gray8 = &ssse3_deinterlace_4field_gray8;
|
||||
Debug(4,"Deinterlace: Using SSSE3 delta functions");
|
||||
} else {
|
||||
fptr_deinterlace_4field_rgba = &std_deinterlace_4field_rgba;
|
||||
fptr_deinterlace_4field_bgra = &std_deinterlace_4field_bgra;
|
||||
fptr_deinterlace_4field_argb = &std_deinterlace_4field_argb;
|
||||
fptr_deinterlace_4field_abgr = &std_deinterlace_4field_abgr;
|
||||
fptr_deinterlace_4field_gray8 = &std_deinterlace_4field_gray8;
|
||||
Debug(4,"Deinterlace: Using standard delta functions");
|
||||
}
|
||||
|
||||
/* Use SSE2 aligned memory copy? */
|
||||
if(config.cpu_extensions && sseversion >= 20) {
|
||||
fptr_imgbufcpy = &sse2_aligned_memcpy;
|
||||
|
@ -3297,6 +3386,91 @@ __attribute__((noinline)) void std_fastblend(const uint8_t* col1, const uint8_t*
|
|||
}
|
||||
}
|
||||
|
||||
/* FastBlend Neon for AArch32 */
|
||||
#if (defined(__arm__) && !defined(ZM_STRIP_NEON))
|
||||
__attribute__((noinline,__target__("fpu=neon")))
|
||||
#endif
|
||||
void neon32_armv7_fastblend(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count, double blendpercent) {
|
||||
#if (defined(__arm__) && !defined(ZM_STRIP_NEON))
|
||||
static int8_t divider = 0;
|
||||
static double current_blendpercent = 0.0;
|
||||
|
||||
if(current_blendpercent != blendpercent) {
|
||||
/* Attempt to match the blending percent to one of the possible values */
|
||||
if(blendpercent < 2.34375) {
|
||||
// 1.5625% blending
|
||||
divider = 6;
|
||||
} else if(blendpercent >= 2.34375 && blendpercent < 4.6875) {
|
||||
// 3.125% blending
|
||||
divider = 5;
|
||||
} else if(blendpercent >= 4.6875 && blendpercent < 9.375) {
|
||||
// 6.25% blending
|
||||
divider = 4;
|
||||
} else if(blendpercent >= 9.375 && blendpercent < 18.75) {
|
||||
// 12.5% blending
|
||||
divider = 3;
|
||||
} else if(blendpercent >= 18.75 && blendpercent < 37.5) {
|
||||
// 25% blending
|
||||
divider = 2;
|
||||
} else if(blendpercent >= 37.5) {
|
||||
// 50% blending
|
||||
divider = 1;
|
||||
}
|
||||
// We only have instruction to shift left by a variable, going negative shifts right :)
|
||||
divider *= -1;
|
||||
current_blendpercent = blendpercent;
|
||||
}
|
||||
|
||||
/* Q0(D0,D1) = col1+0 */
|
||||
/* Q1(D2,D3) = col1+16 */
|
||||
/* Q2(D4,D5) = col1+32 */
|
||||
/* Q3(D6,D7) = col1+48 */
|
||||
/* Q4(D8,D9) = col2+0 */
|
||||
/* Q5(D10,D11) = col2+16 */
|
||||
/* Q6(D12,D13) = col2+32 */
|
||||
/* Q7(D14,D15) = col2+48 */
|
||||
/* Q8(D16,D17) = col1tmp+0 */
|
||||
/* Q9(D18,D19) = col1tmp+16 */
|
||||
/* Q10(D20,D21) = col1tmp+32 */
|
||||
/* Q11(D22,D23) = col1tmp+48 */
|
||||
/* Q12(D24,D25) = divider */
|
||||
|
||||
__asm__ __volatile__ (
|
||||
"mov r12, %4\n\t"
|
||||
"vdup.8 q12, r12\n\t"
|
||||
"neon32_armv7_fastblend_iter:\n\t"
|
||||
"pld [%0,#256]\n\t"
|
||||
"pld [%1,#256]\n\t"
|
||||
"vldm %0!, {q0,q1,q2,q3}\n\t"
|
||||
"vldm %1!, {q4,q5,q6,q7}\n\t"
|
||||
"vrshl.u8 q8, q0, q12\n\t"
|
||||
"vrshl.u8 q9, q1, q12\n\t"
|
||||
"vrshl.u8 q10, q2, q12\n\t"
|
||||
"vrshl.u8 q11, q3, q12\n\t"
|
||||
"vrshl.u8 q4, q4, q12\n\t"
|
||||
"vrshl.u8 q5, q5, q12\n\t"
|
||||
"vrshl.u8 q6, q6, q12\n\t"
|
||||
"vrshl.u8 q7, q7, q12\n\t"
|
||||
"vsub.i8 q4, q4, q8\n\t"
|
||||
"vsub.i8 q5, q5, q9\n\t"
|
||||
"vsub.i8 q6, q6, q10\n\t"
|
||||
"vsub.i8 q7, q7, q11\n\t"
|
||||
"vadd.i8 q4, q4, q0\n\t"
|
||||
"vadd.i8 q5, q5, q1\n\t"
|
||||
"vadd.i8 q6, q6, q2\n\t"
|
||||
"vadd.i8 q7, q7, q3\n\t"
|
||||
"vstm %2!, {q4,q5,q6,q7}\n\t"
|
||||
"subs %3, %3, #64\n\t"
|
||||
"bne neon32_armv7_fastblend_iter\n\t"
|
||||
:
|
||||
: "r" (col1), "r" (col2), "r" (result), "r" (count), "r" (divider)
|
||||
: "%r12", "%q0", "%q1", "%q2", "%q3", "%q4", "%q5", "%q6", "%q7", "%q8", "%q9", "%q10", "%q11", "%q12", "cc", "memory"
|
||||
);
|
||||
#else
|
||||
Panic("Neon function called on a non-ARM platform or Neon code is absent");
|
||||
#endif
|
||||
}
|
||||
|
||||
__attribute__((noinline)) void std_blend(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count, double blendpercent) {
|
||||
double divide = blendpercent / 100.0;
|
||||
double opacity = 1.0 - divide;
|
||||
|
@ -3519,6 +3693,122 @@ __attribute__((noinline)) void std_delta8_abgr(const uint8_t* col1, const uint8_
|
|||
}
|
||||
}
|
||||
|
||||
/* Grayscale Neon for AArch32 */
|
||||
#if (defined(__arm__) && !defined(ZM_STRIP_NEON))
|
||||
__attribute__((noinline,__target__("fpu=neon")))
|
||||
#endif
|
||||
void neon32_armv7_delta8_gray8(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count) {
|
||||
#if (defined(__arm__) && !defined(ZM_STRIP_NEON))
|
||||
|
||||
/* Q0(D0,D1) = col1+0 */
|
||||
/* Q1(D2,D3) = col1+16 */
|
||||
/* Q2(D4,D5) = col1+32 */
|
||||
/* Q3(D6,D7) = col1+48 */
|
||||
/* Q4(D8,D9) = col2+0 */
|
||||
/* Q5(D10,D11) = col2+16 */
|
||||
/* Q6(D12,D13) = col2+32 */
|
||||
/* Q7(D14,D15) = col2+48 */
|
||||
|
||||
__asm__ __volatile__ (
|
||||
"neon32_armv7_delta8_gray8_iter:\n\t"
|
||||
"pld [%0,#256]\n\t"
|
||||
"pld [%1,#256]\n\t"
|
||||
"vldm %0!, {q0,q1,q2,q3}\n\t"
|
||||
"vldm %1!, {q4,q5,q6,q7}\n\t"
|
||||
"vabd.u8 q0, q0, q4\n\t"
|
||||
"vabd.u8 q1, q1, q5\n\t"
|
||||
"vabd.u8 q2, q2, q6\n\t"
|
||||
"vabd.u8 q3, q3, q7\n\t"
|
||||
"vstm %2!, {q0,q1,q2,q3}\n\t"
|
||||
"subs %3, %3, #64\n\t"
|
||||
"bne neon32_armv7_delta8_gray8_iter\n\t"
|
||||
:
|
||||
: "r" (col1), "r" (col2), "r" (result), "r" (count)
|
||||
: "%q0", "%q1", "%q2", "%q3", "%q4", "%q5", "%q6", "%q7", "cc", "memory"
|
||||
);
|
||||
#else
|
||||
Panic("Neon function called on a non-ARM platform or Neon code is absent");
|
||||
#endif
|
||||
}
|
||||
|
||||
/* RGB32 Neon for AArch32 */
|
||||
#if (defined(__arm__) && !defined(ZM_STRIP_NEON))
|
||||
__attribute__((noinline,__target__("fpu=neon")))
|
||||
#endif
|
||||
void neon32_armv7_delta8_rgb32(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count, uint32_t multiplier) {
|
||||
#if (defined(__arm__) && !defined(ZM_STRIP_NEON))
|
||||
|
||||
/* Q0(D0,D1) = col1+0 */
|
||||
/* Q1(D2,D3) = col1+16 */
|
||||
/* Q2(D4,D5) = col1+32 */
|
||||
/* Q3(D6,D7) = col1+48 */
|
||||
/* Q4(D8,D9) = col2+0 */
|
||||
/* Q5(D10,D11) = col2+16 */
|
||||
/* Q6(D12,D13) = col2+32 */
|
||||
/* Q7(D14,D15) = col2+48 */
|
||||
/* Q8(D16,D17) = multiplier */
|
||||
|
||||
__asm__ __volatile__ (
|
||||
"mov r12, %4\n\t"
|
||||
"vdup.32 q8, r12\n\t"
|
||||
"neon32_armv7_delta8_rgb32_iter:\n\t"
|
||||
"pld [%0,#256]\n\t"
|
||||
"pld [%1,#256]\n\t"
|
||||
"vldm %0!, {q0,q1,q2,q3}\n\t"
|
||||
"vldm %1!, {q4,q5,q6,q7}\n\t"
|
||||
"vabd.u8 q0, q0, q4\n\t"
|
||||
"vabd.u8 q1, q1, q5\n\t"
|
||||
"vabd.u8 q2, q2, q6\n\t"
|
||||
"vabd.u8 q3, q3, q7\n\t"
|
||||
"vrshr.u8 q0, q0, #3\n\t"
|
||||
"vrshr.u8 q1, q1, #3\n\t"
|
||||
"vrshr.u8 q2, q2, #3\n\t"
|
||||
"vrshr.u8 q3, q3, #3\n\t"
|
||||
"vmul.i8 q0, q0, q8\n\t"
|
||||
"vmul.i8 q1, q1, q8\n\t"
|
||||
"vmul.i8 q2, q2, q8\n\t"
|
||||
"vmul.i8 q3, q3, q8\n\t"
|
||||
"vpadd.i8 d0, d0, d1\n\t"
|
||||
"vpadd.i8 d2, d2, d3\n\t"
|
||||
"vpadd.i8 d4, d4, d5\n\t"
|
||||
"vpadd.i8 d6, d6, d7\n\t"
|
||||
"vpadd.i8 d0, d0, d0\n\t"
|
||||
"vpadd.i8 d1, d2, d2\n\t"
|
||||
"vpadd.i8 d2, d4, d4\n\t"
|
||||
"vpadd.i8 d3, d6, d6\n\t"
|
||||
"vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [%2]!\n\t"
|
||||
"subs %3, %3, #16\n\t"
|
||||
"bne neon32_armv7_delta8_rgb32_iter\n\t"
|
||||
:
|
||||
: "r" (col1), "r" (col2), "r" (result), "r" (count), "r" (multiplier)
|
||||
: "%r12", "%q0", "%q1", "%q2", "%q3", "%q4", "%q5", "%q6", "%q7", "%q8", "cc", "memory"
|
||||
);
|
||||
}
|
||||
#else
|
||||
Panic("Neon function called on a non-ARM platform or Neon code is absent");
|
||||
#endif
|
||||
}
|
||||
|
||||
/* RGB32: RGBA Neon for AArch32 */
|
||||
void neon32_armv7_delta8_rgba(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count) {
|
||||
neon32_armv7_delta8_rgb32(col1, col2, result, count, 0x00010502);
|
||||
}
|
||||
|
||||
/* RGB32: BGRA Neon for AArch32 */
|
||||
void neon32_armv7_delta8_bgra(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count) {
|
||||
neon32_armv7_delta8_rgb32(col1, col2, result, count, 0x00020501);
|
||||
}
|
||||
|
||||
/* RGB32: ARGB Neon for AArch32 */
|
||||
void neon32_armv7_delta8_argb(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count) {
|
||||
neon32_armv7_delta8_rgb32(col1, col2, result, count, 0x01050200);
|
||||
}
|
||||
|
||||
/* RGB32: ABGR Neon for AArch32 */
|
||||
void neon32_armv7_delta8_abgr(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count) {
|
||||
neon32_armv7_delta8_rgb32(col1, col2, result, count, 0x02050100);
|
||||
}
|
||||
|
||||
/* Grayscale SSE2 */
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
__attribute__((noinline,__target__("sse2")))
|
||||
|
|
|
@ -54,7 +54,7 @@ extern imgbufcpy_fptr_t fptr_imgbufcpy;
|
|||
|
||||
/* Should be called from Image class functions */
|
||||
inline static uint8_t* AllocBuffer(size_t p_bufsize) {
|
||||
uint8_t* buffer = (uint8_t*)zm_mallocaligned(16,p_bufsize);
|
||||
uint8_t* buffer = (uint8_t*)zm_mallocaligned(64,p_bufsize);
|
||||
if(buffer == NULL)
|
||||
Fatal("Memory allocation failed: %s",strerror(errno));
|
||||
|
||||
|
@ -264,6 +264,7 @@ public:
|
|||
/* Blend functions */
|
||||
void sse2_fastblend(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count, double blendpercent);
|
||||
void std_fastblend(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count, double blendpercent);
|
||||
void neon32_armv7_fastblend(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count, double blendpercent);
|
||||
void std_blend(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count, double blendpercent);
|
||||
|
||||
/* Delta functions */
|
||||
|
@ -274,6 +275,11 @@ void std_delta8_rgba(const uint8_t* col1, const uint8_t* col2, uint8_t* result,
|
|||
void std_delta8_bgra(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
|
||||
void std_delta8_argb(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
|
||||
void std_delta8_abgr(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
|
||||
void neon32_armv7_delta8_gray8(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
|
||||
void neon32_armv7_delta8_rgba(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
|
||||
void neon32_armv7_delta8_bgra(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
|
||||
void neon32_armv7_delta8_argb(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
|
||||
void neon32_armv7_delta8_abgr(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
|
||||
void sse2_delta8_gray8(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
|
||||
void sse2_delta8_rgba(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
|
||||
void sse2_delta8_bgra(const uint8_t* col1, const uint8_t* col2, uint8_t* result, unsigned long count);
|
||||
|
|
|
@ -182,8 +182,8 @@ int LibvlcCamera::PrimeCapture()
|
|||
|
||||
mLibvlcData.bufferSize = width * height * mBpp;
|
||||
// Libvlc wants 32 byte alignment for images (should in theory do this for all image lines)
|
||||
mLibvlcData.buffer = (uint8_t*)zm_mallocaligned(32, mLibvlcData.bufferSize);
|
||||
mLibvlcData.prevBuffer = (uint8_t*)zm_mallocaligned(32, mLibvlcData.bufferSize);
|
||||
mLibvlcData.buffer = (uint8_t*)zm_mallocaligned(64, mLibvlcData.bufferSize);
|
||||
mLibvlcData.prevBuffer = (uint8_t*)zm_mallocaligned(64, mLibvlcData.bufferSize);
|
||||
|
||||
mLibvlcData.newImage.setValueImmediate(false);
|
||||
|
||||
|
|
|
@ -393,7 +393,7 @@ Monitor::Monitor(
|
|||
+ sizeof(TriggerData)
|
||||
+ (image_buffer_count*sizeof(struct timeval))
|
||||
+ (image_buffer_count*camera->ImageSize())
|
||||
+ 64; /* Padding used to permit aligning the images buffer to 16 byte boundary */
|
||||
+ 64; /* Padding used to permit aligning the images buffer to 64 byte boundary */
|
||||
|
||||
Debug( 1, "mem.size=%d", mem_size );
|
||||
mem_ptr = NULL;
|
||||
|
@ -569,10 +569,10 @@ bool Monitor::connect() {
|
|||
struct timeval *shared_timestamps = (struct timeval *)((char *)trigger_data + sizeof(TriggerData));
|
||||
unsigned char *shared_images = (unsigned char *)((char *)shared_timestamps + (image_buffer_count*sizeof(struct timeval)));
|
||||
|
||||
if(((unsigned long)shared_images % 16) != 0) {
|
||||
/* Align images buffer to nearest 16 byte boundary */
|
||||
Debug(3,"Aligning shared memory images to the next 16 byte boundary");
|
||||
shared_images = (uint8_t*)((unsigned long)shared_images + (16 - ((unsigned long)shared_images % 16)));
|
||||
if(((unsigned long)shared_images % 64) != 0) {
|
||||
/* Align images buffer to nearest 64 byte boundary */
|
||||
Debug(3,"Aligning shared memory images to the next 64 byte boundary");
|
||||
shared_images = (uint8_t*)((unsigned long)shared_images + (64 - ((unsigned long)shared_images % 64)));
|
||||
}
|
||||
image_buffer = new Snapshot[image_buffer_count];
|
||||
for ( int i = 0; i < image_buffer_count; i++ )
|
||||
|
|
|
@ -24,12 +24,16 @@
|
|||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
#if defined(__arm__)
|
||||
#include <sys/auxv.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_CURL_CURL_H
|
||||
#include <curl/curl.h>
|
||||
#endif
|
||||
|
||||
unsigned int sseversion = 0;
|
||||
unsigned int neonversion = 0;
|
||||
|
||||
std::string trimSet(std::string str, std::string trimset) {
|
||||
// Trim Both leading and trailing sets
|
||||
|
@ -238,30 +242,59 @@ int pairsplit(const char* string, const char delim, std::string& name, std::stri
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* Sets sse_version */
|
||||
void ssedetect() {
|
||||
/* Detect special hardware features, such as SIMD instruction sets */
|
||||
void hwcaps_detect() {
|
||||
neonversion = 0;
|
||||
sseversion = 0;
|
||||
#if (defined(__i386__) || defined(__x86_64__))
|
||||
/* x86 or x86-64 processor */
|
||||
uint32_t r_edx, r_ecx;
|
||||
|
||||
uint32_t r_edx, r_ecx, r_ebx;
|
||||
|
||||
#ifdef __x86_64__
|
||||
__asm__ __volatile__(
|
||||
#if defined(__i386__)
|
||||
"pushl %%ebx;\n\t"
|
||||
#endif
|
||||
"push %%rbx\n\t"
|
||||
"mov $0x0,%%ecx\n\t"
|
||||
"mov $0x7,%%eax\n\t"
|
||||
"cpuid\n\t"
|
||||
"push %%rbx\n\t"
|
||||
"mov $0x1,%%eax\n\t"
|
||||
"cpuid\n\t"
|
||||
#if defined(__i386__)
|
||||
"popl %%ebx;\n\t"
|
||||
#endif
|
||||
: "=d" (r_edx), "=c" (r_ecx)
|
||||
"pop %%rax\n\t"
|
||||
"pop %%rbx\n\t"
|
||||
: "=d" (r_edx), "=c" (r_ecx), "=a" (r_ebx)
|
||||
:
|
||||
:
|
||||
: "%eax"
|
||||
#if !defined(__i386__)
|
||||
, "%ebx"
|
||||
#endif
|
||||
);
|
||||
|
||||
if (r_ecx & 0x00000200) {
|
||||
#else
|
||||
__asm__ __volatile__(
|
||||
"push %%ebx\n\t"
|
||||
"mov $0x0,%%ecx\n\t"
|
||||
"mov $0x7,%%eax\n\t"
|
||||
"cpuid\n\t"
|
||||
"push %%ebx\n\t"
|
||||
"mov $0x1,%%eax\n\t"
|
||||
"cpuid\n\t"
|
||||
"pop %%eax\n\t"
|
||||
"pop %%ebx\n\t"
|
||||
: "=d" (r_edx), "=c" (r_ecx), "=a" (r_ebx)
|
||||
:
|
||||
:
|
||||
);
|
||||
#endif
|
||||
|
||||
if (r_ebx & 0x00000020) {
|
||||
sseversion = 52; /* AVX2 */
|
||||
Debug(1,"Detected a x86\\x86-64 processor with AVX2");
|
||||
} else if (r_ecx & 0x10000000) {
|
||||
sseversion = 51; /* AVX */
|
||||
Debug(1,"Detected a x86\\x86-64 processor with AVX");
|
||||
} else if (r_ecx & 0x00100000) {
|
||||
sseversion = 42; /* SSE4.2 */
|
||||
Debug(1,"Detected a x86\\x86-64 processor with SSE4.2");
|
||||
} else if (r_ecx & 0x00080000) {
|
||||
sseversion = 41; /* SSE4.1 */
|
||||
Debug(1,"Detected a x86\\x86-64 processor with SSE4.1");
|
||||
} else if (r_ecx & 0x00000200) {
|
||||
sseversion = 35; /* SSSE3 */
|
||||
Debug(1,"Detected a x86\\x86-64 processor with SSSE3");
|
||||
} else if (r_ecx & 0x00000001) {
|
||||
|
@ -276,12 +309,20 @@ void ssedetect() {
|
|||
} else {
|
||||
sseversion = 0;
|
||||
Debug(1,"Detected a x86\\x86-64 processor");
|
||||
}
|
||||
#elif defined(__arm__)
|
||||
// ARM processor
|
||||
// To see if it supports NEON, we need to get that information from the kernel
|
||||
unsigned long auxval = getauxval(AT_HWCAP);
|
||||
if (auxval & HWCAP_ARM_NEON) {
|
||||
Debug(1,"Detected ARM processor with Neon");
|
||||
neonversion = 1;
|
||||
} else {
|
||||
Debug(1,"Detected ARM processor");
|
||||
}
|
||||
|
||||
#else
|
||||
/* Non x86 or x86-64 processor, SSE2 is not available */
|
||||
Debug(1,"Detected a non x86\\x86-64 processor");
|
||||
sseversion = 0;
|
||||
// Unknown processor
|
||||
Debug(1,"Detected unknown processor architecture");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -54,11 +54,12 @@ inline int min( int a, int b )
|
|||
return( a<=b?a:b );
|
||||
}
|
||||
|
||||
void ssedetect();
|
||||
void* sse2_aligned_memcpy(void* dest, const void* src, size_t bytes);
|
||||
void timespec_diff(struct timespec *start, struct timespec *end, struct timespec *diff);
|
||||
|
||||
void hwcaps_detect();
|
||||
extern unsigned int sseversion;
|
||||
extern unsigned int neonversion;
|
||||
|
||||
std::string UriDecode( const std::string &encoded );
|
||||
|
||||
|
|
|
@ -133,7 +133,7 @@ int main( int argc, char *argv[] )
|
|||
|
||||
logInit( log_id_string );
|
||||
|
||||
ssedetect();
|
||||
hwcaps_detect();
|
||||
|
||||
Monitor *monitor = Monitor::Load( id, true, Monitor::ANALYSIS );
|
||||
|
||||
|
|
|
@ -206,7 +206,7 @@ int main( int argc, char *argv[] )
|
|||
|
||||
logInit( log_id_string );
|
||||
|
||||
ssedetect();
|
||||
hwcaps_detect();
|
||||
|
||||
Monitor **monitors = 0;
|
||||
int n_monitors = 0;
|
||||
|
|
|
@ -91,7 +91,7 @@ int main( int argc, const char *argv[] )
|
|||
|
||||
logInit( "zms" );
|
||||
|
||||
ssedetect();
|
||||
hwcaps_detect();
|
||||
|
||||
zmSetDefaultTermHandler();
|
||||
zmSetDefaultDieHandler();
|
||||
|
|
Loading…
Reference in New Issue