Move ARM Neon data prefetches to be after the loads

This commit is contained in:
Kfir Itzhak 2017-05-13 19:08:18 +03:00
parent c66deb564f
commit 5df7499da4
1 changed file with 12 additions and 12 deletions

View File

@@ -3428,10 +3428,10 @@ void neon32_armv7_fastblend(const uint8_t* col1, const uint8_t* col2, uint8_t* r
"mov r12, %4\n\t"
"vdup.8 q12, r12\n\t"
"neon32_armv7_fastblend_iter:\n\t"
"pld [%0,#256]\n\t"
"pld [%1,#256]\n\t"
"vldm %0!, {q0,q1,q2,q3}\n\t"
"vldm %1!, {q4,q5,q6,q7}\n\t"
"pld [%0, #256]\n\t"
"pld [%1, #256]\n\t"
"vrshl.u8 q8, q0, q12\n\t"
"vrshl.u8 q9, q1, q12\n\t"
"vrshl.u8 q10, q2, q12\n\t"
@@ -3509,12 +3509,12 @@ __attribute__((noinline)) void neon64_armv8_fastblend(const uint8_t* col1, const
"mov x12, %4\n\t"
"dup v28.16b, w12\n\t"
"neon64_armv8_fastblend_iter:\n\t"
"prfm pldl1keep, [%0, #256]\n\t"
"prfm pldl1keep, [%1, #256]\n\t"
"ldp q16, q17, [%0], #32\n\t"
"ldp q18, q19, [%0], #32\n\t"
"ldp q20, q21, [%1], #32\n\t"
"ldp q22, q23, [%1], #32\n\t"
"prfm pldl1keep, [%0, #256]\n\t"
"prfm pldl1keep, [%1, #256]\n\t"
"urshl v24.16b, v16.16b, v28.16b\n\t"
"urshl v25.16b, v17.16b, v28.16b\n\t"
"urshl v26.16b, v18.16b, v28.16b\n\t"
@@ -3784,10 +3784,10 @@ void neon32_armv7_delta8_gray8(const uint8_t* col1, const uint8_t* col2, uint8_t
__asm__ __volatile__ (
"neon32_armv7_delta8_gray8_iter:\n\t"
"pld [%0,#256]\n\t"
"pld [%1,#256]\n\t"
"vldm %0!, {q0,q1,q2,q3}\n\t"
"vldm %1!, {q4,q5,q6,q7}\n\t"
"pld [%0, #512]\n\t"
"pld [%1, #512]\n\t"
"vabd.u8 q0, q0, q4\n\t"
"vabd.u8 q1, q1, q5\n\t"
"vabd.u8 q2, q2, q6\n\t"
@@ -3819,12 +3819,12 @@ __attribute__((noinline)) void neon64_armv8_delta8_gray8(const uint8_t* col1, co
__asm__ __volatile__ (
"neon64_armv8_delta8_gray8_iter:\n\t"
"prfm pldl1keep, [%0, #512]\n\t"
"prfm pldl1keep, [%1, #512]\n\t"
"ldp q16, q17, [%0], #32\n\t"
"ldp q18, q19, [%0], #32\n\t"
"ldp q20, q21, [%1], #32\n\t"
"ldp q22, q23, [%1], #32\n\t"
"prfm pldl1keep, [%0, #512]\n\t"
"prfm pldl1keep, [%1, #512]\n\t"
"uabd v16.16b, v16.16b, v20.16b\n\t"
"uabd v17.16b, v17.16b, v21.16b\n\t"
"uabd v18.16b, v18.16b, v22.16b\n\t"
@@ -3863,10 +3863,10 @@ void neon32_armv7_delta8_rgb32(const uint8_t* col1, const uint8_t* col2, uint8_t
"mov r12, %4\n\t"
"vdup.32 q8, r12\n\t"
"neon32_armv7_delta8_rgb32_iter:\n\t"
"pld [%0,#256]\n\t"
"pld [%1,#256]\n\t"
"vldm %0!, {q0,q1,q2,q3}\n\t"
"vldm %1!, {q4,q5,q6,q7}\n\t"
"pld [%0, #256]\n\t"
"pld [%1, #256]\n\t"
"vabd.u8 q0, q0, q4\n\t"
"vabd.u8 q1, q1, q5\n\t"
"vabd.u8 q2, q2, q6\n\t"
@@ -3918,12 +3918,12 @@ __attribute__((noinline)) void neon64_armv8_delta8_rgb32(const uint8_t* col1, co
"mov x12, %4\n\t"
"dup v24.4s, w12\n\t"
"neon64_armv8_delta8_rgb32_iter:\n\t"
"prfm pldl1keep, [%0, #256]\n\t"
"prfm pldl1keep, [%1, #256]\n\t"
"ldp q16, q17, [%0], #32\n\t"
"ldp q18, q19, [%0], #32\n\t"
"ldp q20, q21, [%1], #32\n\t"
"ldp q22, q23, [%1], #32\n\t"
"prfm pldl1keep, [%0, #256]\n\t"
"prfm pldl1keep, [%1, #256]\n\t"
"uabd v16.16b, v16.16b, v20.16b\n\t"
"uabd v17.16b, v17.16b, v21.16b\n\t"
"uabd v18.16b, v18.16b, v22.16b\n\t"