summary | refs | log | tree | commit | diff
path: root/1
diff options
context:
space:
mode:
Diffstat (limited to '1')
-rw-r--r-- 1/part2_fast.c 10
-rw-r--r-- 1/repair_avx.asm 18
2 files changed, 9 insertions, 19 deletions
diff --git a/1/part2_fast.c b/1/part2_fast.c
index 67cd6c5..f4f6f3a 100644
--- a/1/part2_fast.c
+++ b/1/part2_fast.c
@@ -16,14 +16,8 @@ int repair_avx_inner(int i, const int *arr, __m256i search)
for (int k = 0; k < INPUT_LEN; k += 8) {
__m256i new = _mm256_loadu_si256((__m256i *)(&arr[k]));
int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(new, cmp));
- if (mask == 0) {
- continue;
- }
-
- for (int l = 0; l < 8; l++) {
- if (mask & (0xf << (l * 4))) {
- return arr[k + l];
- }
+ if (mask != 0) {
+ return _mm256_extract_epi32(cmp, 0);
}
}
diff --git a/1/repair_avx.asm b/1/repair_avx.asm
index 4a268a6..7271138 100644
--- a/1/repair_avx.asm
+++ b/1/repair_avx.asm
@@ -7,22 +7,18 @@ repair_avx_inner:
vmovd xmm1, edi
vpbroadcastd ymm1, xmm1
vpsubd ymm1, ymm0, ymm1
-%rep 24
- vpcmpeqd ymm2, ymm1, [rsi]
- vpmovmskb edx, ymm2
- test edx, edx
+%assign i 0
+%rep 25
+ vpcmpeqd ymm2, ymm1, [rsi + i]
+ vpmovmskb eax, ymm2
+ test eax, eax
jne .found
- add rsi, 32 ; set up to read the next 256 bits (32 bytes) (8 * dword)
+%assign i i+32
%endrep
- vpcmpeqd ymm2, ymm1, [rsi]
- vpmovmskb edx, ymm2
- test edx, edx
- jne .found
xor eax, eax ; not found, return 0
vzeroupper ; eliminate performance penalties caused by false dependencies when transitioning between AVX and legacy SSE instructions
ret
.found:
- bsf edx, edx
- mov eax, dword [rsi + rdx]
vzeroupper ; eliminate performance penalties caused by false dependencies when transitioning between AVX and legacy SSE instructions
+ movd eax, xmm1
ret