diff options
 1/part2_fast.c   | 23 ++++++++++++++---------
 1/repair_avx.asm | 29 +++++++++++++++++++++++++++++
2 files changed, 43 insertions, 9 deletions
diff --git a/1/part2_fast.c b/1/part2_fast.c index 4523c65..ffb4504 100644 --- a/1/part2_fast.c +++ b/1/part2_fast.c @@ -7,7 +7,10 @@ #define INPUT_LEN 200 #define SEARCH 2020 -int inner_repair_avx(int i, const int *arr, __m256i search) +#ifdef USE_ASM +int repair_avx_inner(int i, const int *arr, __m256i search); +#else +int repair_avx_inner(int i, const int *arr, __m256i search) { __m256i start = _mm256_set1_epi32(i); for (int k = 0; k < INPUT_LEN; k += 8) { @@ -27,6 +30,7 @@ int inner_repair_avx(int i, const int *arr, __m256i search) return 0; } +#endif int repair_avx(const int *arr) { @@ -54,7 +58,7 @@ int repair_avx(const int *arr) label0: if ((mask & 0x0000000f) == 0) { - int tmp = inner_repair_avx(_mm256_extract_epi32(new, 0), arr, search); + int tmp = repair_avx_inner(_mm256_extract_epi32(new, 0), arr, search); if (tmp) { return tmp * arr[i] * arr[j]; } @@ -62,7 +66,7 @@ int repair_avx(const int *arr) label1: if ((mask & 0x000000f0) == 0) { - int tmp = inner_repair_avx(_mm256_extract_epi32(new, 1), arr, search); + int tmp = repair_avx_inner(_mm256_extract_epi32(new, 1), arr, search); if (tmp) { return tmp * arr[i] * arr[j + 1]; } @@ -70,7 +74,7 @@ int repair_avx(const int *arr) label2: if ((mask & 0x00000f00) == 0) { - int tmp = inner_repair_avx(_mm256_extract_epi32(new, 2), arr, search); + int tmp = repair_avx_inner(_mm256_extract_epi32(new, 2), arr, search); if (tmp) { return tmp * arr[i] * arr[j + 2]; } @@ -78,7 +82,7 @@ int repair_avx(const int *arr) label3: if ((mask & 0x0000f000) == 0) { - int tmp = inner_repair_avx(_mm256_extract_epi32(new, 3), arr, search); + int tmp = repair_avx_inner(_mm256_extract_epi32(new, 3), arr, search); if (tmp) { return tmp * arr[i] * arr[j + 3]; } @@ -86,7 +90,7 @@ int repair_avx(const int *arr) label4: if ((mask & 0x000f0000) == 0) { - int tmp = inner_repair_avx(_mm256_extract_epi32(new, 4), arr, search); + int tmp = repair_avx_inner(_mm256_extract_epi32(new, 4), arr, search); if (tmp) { return tmp * arr[i] * arr[j + 4]; 
} @@ -94,7 +98,7 @@ int repair_avx(const int *arr) label5: if ((mask & 0x00f00000) == 0 ){ - int tmp = inner_repair_avx(_mm256_extract_epi32(new, 5), arr, search); + int tmp = repair_avx_inner(_mm256_extract_epi32(new, 5), arr, search); if (tmp) { return tmp * arr[i] * arr[j + 5]; } @@ -102,7 +106,7 @@ int repair_avx(const int *arr) label6: if ((mask & 0x0f000000) == 0) { - int tmp = inner_repair_avx(_mm256_extract_epi32(new, 6), arr, search); + int tmp = repair_avx_inner(_mm256_extract_epi32(new, 6), arr, search); if (tmp) { return tmp * arr[i] * arr[j + 6]; } @@ -110,7 +114,7 @@ int repair_avx(const int *arr) label7: if ((mask & 0xf0000000) == 0) { - int tmp = inner_repair_avx(_mm256_extract_epi32(new, 7), arr, search); + int tmp = repair_avx_inner(_mm256_extract_epi32(new, 7), arr, search); if (tmp) { return tmp * arr[i] * arr[j + 7]; } @@ -121,6 +125,7 @@ int repair_avx(const int *arr) return 0; } + int main(int argc, char *argv[]) { FILE *file = fopen(argv[argc - 1], "r"); diff --git a/1/repair_avx.asm b/1/repair_avx.asm new file mode 100644 index 0000000..777e292 --- /dev/null +++ b/1/repair_avx.asm @@ -0,0 +1,29 @@ +global repair_avx_inner + +section .text + +repair_avx_inner: +; vpbroadcastd ymm1, edi ; AVX512VL AVX512F + vmovd xmm1, edi + vpbroadcastd ymm1, xmm1 +%rep 24 + vpaddd ymm2, ymm1, [rsi] + vpcmpeqd ymm2, ymm2, ymm0 + vpmovmskb edx, ymm2 + test edx, edx + jne .found + add rsi, 32 ; set up to read the next 256 bits (32 bytes) (8 * dword) +%endrep + vpaddd ymm2, ymm1, [rsi] + vpcmpeqd ymm2, ymm2, ymm0 + vpmovmskb edx, ymm2 + test edx, edx + jne .found + xor eax, eax ; not found, return 0 + vzeroupper ; eliminate performance penalties caused by false dependencies when transitioning between AVX and legacy SSE instructions + ret +.found: + bsf edx, edx + mov eax, dword [rsi + rdx] + vzeroupper ; eliminate performance penalties caused by false dependencies when transitioning between AVX and legacy SSE instructions + ret |
