blob: 4f128f63ae42cc1b7658bd788a1723c11b4a7222 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
global repair_avx_inner
section .text

;-----------------------------------------------------------------------
; repair_avx_inner
;
; Fully unrolled scan of REPAIR_CHUNK_COUNT consecutive chunks of
; REPAIR_CHUNK_BYTES bytes each, starting at [rdi]. Every chunk is
; compared for dword equality, lane against lane, with ymm0.
;
; In:    rdi  = start of the REPAIR_CHUNK_COUNT * REPAIR_CHUNK_BYTES
;               (= 800) bytes that may be read; the VEX-encoded memory
;               operand does not require 32-byte alignment
;        ymm0 = eight dwords to compare against
;               NOTE(review): presumably a single value broadcast to
;               all lanes, since only the low dword is handed back -
;               confirm against the caller
; Out:   eax  = low dword of xmm0 as soon as any lane of any chunk is
;               equal, otherwise 0. A hit on a zero low dword is thus
;               indistinguishable from "no hit".
; Clobb: rax, ymm1, flags; vzeroupper clears bits 128 and up of every
;        ymm register on both exit paths
;-----------------------------------------------------------------------

; Bytes covered by one ymm compare.
%assign REPAIR_CHUNK_BYTES 32
; Number of chunks inspected per call.
%assign REPAIR_CHUNK_COUNT 25

repair_avx_inner:
%assign chunk_off 0
%rep REPAIR_CHUNK_COUNT
        vpcmpeqd    ymm1, ymm0, [rdi + chunk_off]   ; equal lanes -> all ones
        ; Original author's note: "vptest ymm1, ymm1" was tried here and
        ; found slower than the vpmovmskb + test pair below.
        vpmovmskb   eax, ymm1                       ; one mask bit per byte
        test        eax, eax
        jnz         .match                          ; any bit set -> a lane matched
%assign chunk_off chunk_off + REPAIR_CHUNK_BYTES
%endrep

        ; Fell through every chunk: nothing matched.
        vzeroupper                                  ; avoid AVX -> legacy SSE transition penalties
        xor         eax, eax                        ; return 0
        ret

.match:
        vzeroupper                                  ; avoid AVX -> legacy SSE transition penalties
        ; Legacy-encoded movd after vzeroupper: per the original author this
        ; is smaller than a vmovd placed before vzeroupper, with no
        ; measurable speed difference.
        movd        eax, xmm0                       ; return low dword of xmm0
        ret
|