From c6d4b175c275602cb5b1d0be6123b656dc0bbb27 Mon Sep 17 00:00:00 2001 From: Bond_009 Date: Tue, 5 Jan 2021 16:00:08 +0100 Subject: Optimize repair_avx_inner some more --- 1/part2_fast.c | 10 ++-------- 1/repair_avx.asm | 18 +++++++----------- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/1/part2_fast.c b/1/part2_fast.c index 67cd6c5..f4f6f3a 100644 --- a/1/part2_fast.c +++ b/1/part2_fast.c @@ -16,14 +16,8 @@ int repair_avx_inner(int i, const int *arr, __m256i search) for (int k = 0; k < INPUT_LEN; k += 8) { __m256i new = _mm256_loadu_si256((__m256i *)(&arr[k])); int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(new, cmp)); - if (mask == 0) { - continue; - } - - for (int l = 0; l < 8; l++) { - if (mask & (0xf << (l * 4))) { - return arr[k + l]; - } + if (mask != 0) { + return _mm256_extract_epi32(cmp, 0); } } diff --git a/1/repair_avx.asm b/1/repair_avx.asm index 4a268a6..7271138 100644 --- a/1/repair_avx.asm +++ b/1/repair_avx.asm @@ -7,22 +7,18 @@ repair_avx_inner: vmovd xmm1, edi vpbroadcastd ymm1, xmm1 vpsubd ymm1, ymm0, ymm1 -%rep 24 - vpcmpeqd ymm2, ymm1, [rsi] - vpmovmskb edx, ymm2 - test edx, edx +%assign i 0 +%rep 25 + vpcmpeqd ymm2, ymm1, [rsi + i] + vpmovmskb eax, ymm2 + test eax, eax jne .found - add rsi, 32 ; set up to read the next 256 bits (32 bytes) (8 * dword) +%assign i i+32 %endrep - vpcmpeqd ymm2, ymm1, [rsi] - vpmovmskb edx, ymm2 - test edx, edx - jne .found xor eax, eax ; not found, return 0 vzeroupper ; eliminate performance penalties caused by false dependencies when transitioning between AVX and legacy SSE instructions ret .found: - bsf edx, edx - mov eax, dword [rsi + rdx] vzeroupper ; eliminate performance penalties caused by false dependencies when transitioning between AVX and legacy SSE instructions + movd eax, xmm1 ret -- cgit v1.2.3