Move ARM Neon data prefetches to be after the loads
This commit is contained in:
parent
c66deb564f
commit
5df7499da4
|
@ -3428,10 +3428,10 @@ void neon32_armv7_fastblend(const uint8_t* col1, const uint8_t* col2, uint8_t* r
|
|||
"mov r12, %4\n\t"
|
||||
"vdup.8 q12, r12\n\t"
|
||||
"neon32_armv7_fastblend_iter:\n\t"
|
||||
"pld [%0,#256]\n\t"
|
||||
"pld [%1,#256]\n\t"
|
||||
"vldm %0!, {q0,q1,q2,q3}\n\t"
|
||||
"vldm %1!, {q4,q5,q6,q7}\n\t"
|
||||
"pld [%0, #256]\n\t"
|
||||
"pld [%1, #256]\n\t"
|
||||
"vrshl.u8 q8, q0, q12\n\t"
|
||||
"vrshl.u8 q9, q1, q12\n\t"
|
||||
"vrshl.u8 q10, q2, q12\n\t"
|
||||
|
@ -3509,12 +3509,12 @@ __attribute__((noinline)) void neon64_armv8_fastblend(const uint8_t* col1, const
|
|||
"mov x12, %4\n\t"
|
||||
"dup v28.16b, w12\n\t"
|
||||
"neon64_armv8_fastblend_iter:\n\t"
|
||||
"prfm pldl1keep, [%0, #256]\n\t"
|
||||
"prfm pldl1keep, [%1, #256]\n\t"
|
||||
"ldp q16, q17, [%0], #32\n\t"
|
||||
"ldp q18, q19, [%0], #32\n\t"
|
||||
"ldp q20, q21, [%1], #32\n\t"
|
||||
"ldp q22, q23, [%1], #32\n\t"
|
||||
"prfm pldl1keep, [%0, #256]\n\t"
|
||||
"prfm pldl1keep, [%1, #256]\n\t"
|
||||
"urshl v24.16b, v16.16b, v28.16b\n\t"
|
||||
"urshl v25.16b, v17.16b, v28.16b\n\t"
|
||||
"urshl v26.16b, v18.16b, v28.16b\n\t"
|
||||
|
@ -3784,10 +3784,10 @@ void neon32_armv7_delta8_gray8(const uint8_t* col1, const uint8_t* col2, uint8_t
|
|||
|
||||
__asm__ __volatile__ (
|
||||
"neon32_armv7_delta8_gray8_iter:\n\t"
|
||||
"pld [%0,#256]\n\t"
|
||||
"pld [%1,#256]\n\t"
|
||||
"vldm %0!, {q0,q1,q2,q3}\n\t"
|
||||
"vldm %1!, {q4,q5,q6,q7}\n\t"
|
||||
"pld [%0, #512]\n\t"
|
||||
"pld [%1, #512]\n\t"
|
||||
"vabd.u8 q0, q0, q4\n\t"
|
||||
"vabd.u8 q1, q1, q5\n\t"
|
||||
"vabd.u8 q2, q2, q6\n\t"
|
||||
|
@ -3819,12 +3819,12 @@ __attribute__((noinline)) void neon64_armv8_delta8_gray8(const uint8_t* col1, co
|
|||
|
||||
__asm__ __volatile__ (
|
||||
"neon64_armv8_delta8_gray8_iter:\n\t"
|
||||
"prfm pldl1keep, [%0, #512]\n\t"
|
||||
"prfm pldl1keep, [%1, #512]\n\t"
|
||||
"ldp q16, q17, [%0], #32\n\t"
|
||||
"ldp q18, q19, [%0], #32\n\t"
|
||||
"ldp q20, q21, [%1], #32\n\t"
|
||||
"ldp q22, q23, [%1], #32\n\t"
|
||||
"prfm pldl1keep, [%0, #512]\n\t"
|
||||
"prfm pldl1keep, [%1, #512]\n\t"
|
||||
"uabd v16.16b, v16.16b, v20.16b\n\t"
|
||||
"uabd v17.16b, v17.16b, v21.16b\n\t"
|
||||
"uabd v18.16b, v18.16b, v22.16b\n\t"
|
||||
|
@ -3863,10 +3863,10 @@ void neon32_armv7_delta8_rgb32(const uint8_t* col1, const uint8_t* col2, uint8_t
|
|||
"mov r12, %4\n\t"
|
||||
"vdup.32 q8, r12\n\t"
|
||||
"neon32_armv7_delta8_rgb32_iter:\n\t"
|
||||
"pld [%0,#256]\n\t"
|
||||
"pld [%1,#256]\n\t"
|
||||
"vldm %0!, {q0,q1,q2,q3}\n\t"
|
||||
"vldm %1!, {q4,q5,q6,q7}\n\t"
|
||||
"pld [%0, #256]\n\t"
|
||||
"pld [%1, #256]\n\t"
|
||||
"vabd.u8 q0, q0, q4\n\t"
|
||||
"vabd.u8 q1, q1, q5\n\t"
|
||||
"vabd.u8 q2, q2, q6\n\t"
|
||||
|
@ -3918,12 +3918,12 @@ __attribute__((noinline)) void neon64_armv8_delta8_rgb32(const uint8_t* col1, co
|
|||
"mov x12, %4\n\t"
|
||||
"dup v24.4s, w12\n\t"
|
||||
"neon64_armv8_delta8_rgb32_iter:\n\t"
|
||||
"prfm pldl1keep, [%0, #256]\n\t"
|
||||
"prfm pldl1keep, [%1, #256]\n\t"
|
||||
"ldp q16, q17, [%0], #32\n\t"
|
||||
"ldp q18, q19, [%0], #32\n\t"
|
||||
"ldp q20, q21, [%1], #32\n\t"
|
||||
"ldp q22, q23, [%1], #32\n\t"
|
||||
"prfm pldl1keep, [%0, #256]\n\t"
|
||||
"prfm pldl1keep, [%1, #256]\n\t"
|
||||
"uabd v16.16b, v16.16b, v20.16b\n\t"
|
||||
"uabd v17.16b, v17.16b, v21.16b\n\t"
|
||||
"uabd v18.16b, v18.16b, v22.16b\n\t"
|
||||
|
|
Loading…
Reference in New Issue