From c6d4b175c275602cb5b1d0be6123b656dc0bbb27 Mon Sep 17 00:00:00 2001
From: Bond_009 <bond.009@outlook.com>
Date: Tue, 5 Jan 2021 16:00:08 +0100
Subject: Optimize repair_avx_inner some more

---
 1/part2_fast.c   | 10 ++--------
 1/repair_avx.asm | 18 +++++++-----------
 2 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/1/part2_fast.c b/1/part2_fast.c
index 67cd6c5..f4f6f3a 100644
--- a/1/part2_fast.c
+++ b/1/part2_fast.c
@@ -16,14 +16,8 @@ int repair_avx_inner(int i, const int *arr, __m256i search)
     for (int k = 0; k < INPUT_LEN; k += 8) {
         __m256i new = _mm256_loadu_si256((__m256i *)(&arr[k]));
         int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(new, cmp));
-        if (mask == 0) {
-            continue;
-        }
-
-        for (int l = 0; l < 8; l++) {
-            if (mask & (0xf << (l * 4))) {
-                return arr[k + l];
-            }
+        if (mask != 0) {
+            return _mm256_extract_epi32(cmp, 0);
         }
     }
 
diff --git a/1/repair_avx.asm b/1/repair_avx.asm
index 4a268a6..7271138 100644
--- a/1/repair_avx.asm
+++ b/1/repair_avx.asm
@@ -7,22 +7,18 @@ repair_avx_inner:
     vmovd           xmm1, edi
     vpbroadcastd    ymm1, xmm1
     vpsubd          ymm1, ymm0, ymm1
-%rep    24
-    vpcmpeqd        ymm2, ymm1, [rsi]
-    vpmovmskb       edx, ymm2
-    test            edx, edx
+%assign i 0
+%rep    25
+    vpcmpeqd        ymm2, ymm1, [rsi + i]
+    vpmovmskb       eax, ymm2
+    test            eax, eax
     jne             .found
-    add             rsi, 32                 ; set up to read the next 256 bits (32 bytes) (8 * dword)
+%assign i i+32
 %endrep
-    vpcmpeqd        ymm2, ymm1, [rsi]
-    vpmovmskb       edx, ymm2
-    test            edx, edx
-    jne             .found
     xor             eax, eax                ; not found, return 0
     vzeroupper                              ; eliminate performance penalties caused by false dependencies when transitioning between AVX and legacy SSE instructions
     ret
 .found:
-    bsf             edx, edx
-    mov             eax, dword [rsi + rdx]
     vzeroupper                              ; eliminate performance penalties caused by false dependencies when transitioning between AVX and legacy SSE instructions
+    movd            eax, xmm1
     ret
-- 
cgit v1.2.3