global repair_avx_inner

section .text

; repair_avx_inner: compares 25 consecutive 32-byte blocks at [rdi] against ymm0
; and returns the low dword of ymm0 on a match, or 0 if no block matches.
repair_avx_inner:
%assign i 0
%rep 25
    vpcmpeqd ymm1, ymm0, [rdi + i]
    ; vptest ymm1, ymm1              ; slower than vpmovmskb + test
    vpmovmskb eax, ymm1
    test eax, eax
    jne .found
%assign i i+32
%endrep
    xor eax, eax    ; not found, return 0
    vzeroupper      ; eliminate performance penalties caused by false dependencies when transitioning between AVX and legacy SSE instructions
    ret
.found:
    vzeroupper      ; eliminate performance penalties caused by false dependencies when transitioning between AVX and legacy SSE instructions
    movd eax, xmm0  ; smaller than putting a vmovd before the vzeroupper, and no measurable performance difference
    ret
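
; --- Usage sketch (illustrative, not part of the routine above) ---
; A minimal caller sketch, assuming the System V AMD64 ABI: rdi holds the buffer
; pointer and ymm0 holds the 32-bit search value broadcast to every dword lane.
; The wrapper name repair_avx_find_u32 and the use of vpbroadcastd are assumptions
; for this sketch, not taken from the original source.

global repair_avx_find_u32

repair_avx_find_u32:
    ; rdi = buffer pointer, esi = 32-bit value to search for
    vmovd xmm0, esi             ; put the needle in the low dword lane
    vpbroadcastd ymm0, xmm0     ; broadcast it to all eight lanes (requires AVX2)
    jmp repair_avx_inner        ; tail-call; returns the needle if found, 0 otherwise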