diff options
Diffstat (limited to '1')
| -rw-r--r-- | 1/part2_fast.c | 149 | ||||
| -rw-r--r-- | 1/repair_avx.asm | 11 |
2 files changed, 73 insertions, 87 deletions
diff --git a/1/part2_fast.c b/1/part2_fast.c index f4f6f3a..c32f881 100644 --- a/1/part2_fast.c +++ b/1/part2_fast.c @@ -1,4 +1,3 @@ -#include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -8,16 +7,14 @@ #define SEARCH 2020 #ifdef USE_ASM -int repair_avx_inner(int i, const int *arr, __m256i search); +int repair_avx_inner(const int *arr, __m256i search); #else -int repair_avx_inner(int i, const int *arr, __m256i search) +int repair_avx_inner(const int *arr, __m256i search) { - __m256i cmp = _mm256_sub_epi32(search, _mm256_set1_epi32(i)); for (int k = 0; k < INPUT_LEN; k += 8) { __m256i new = _mm256_loadu_si256((__m256i *)(&arr[k])); - int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(new, cmp)); - if (mask != 0) { - return _mm256_extract_epi32(cmp, 0); + if (_mm256_movemask_epi8(_mm256_cmpeq_epi32(new, search))) { + return _mm256_extract_epi32(search, 0); } } @@ -33,84 +30,76 @@ int repair_avx(const int *arr) for (int j = 0; j < INPUT_LEN; j += 8) { __m256i new = _mm256_loadu_si256((__m256i *)(&arr[j])); new = _mm256_add_epi32(start, new); - int mask = _mm256_movemask_epi8(_mm256_cmpgt_epi32(new, search)); + unsigned int mask = (unsigned int)_mm256_movemask_epi8(_mm256_cmpgt_epi32(new, search)); if (mask == 0xffffffff) { continue; } switch (__lzcnt32(mask) / 4) { - case 0: goto label0; - case 1: goto label1; - case 2: goto label2; - case 3: goto label3; - case 4: goto label4; - case 5: goto label5; - case 6: goto label6; - case 7: goto label7; - } - - label0: - if ((mask & 0x0000000f) == 0) { - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 0), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j]; - } - } - - label1: - if ((mask & 0x000000f0) == 0) { - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 1), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j + 1]; - } - } - - label2: - if ((mask & 0x00000f00) == 0) { - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 2), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j + 2]; - } - } - - label3: - if ((mask & 0x0000f000) == 0) { - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 3), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j + 3]; - } - } - - label4: - if ((mask & 0x000f0000) == 0) { - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 4), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j + 4]; - } - } - - label5: - if ((mask & 0x00f00000) == 0 ){ - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 5), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j + 5]; - } - } - - label6: - if ((mask & 0x0f000000) == 0) { - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 6), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j + 6]; - } - } - - label7: - if ((mask & 0xf0000000) == 0) { - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 7), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j + 7]; - } + case 0: + { + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(7))); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j + 7]; + } + } + case 1: + if ((mask & 0x0f000000) == 0) { + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(6))); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j + 6]; + } + } + case 2: + if ((mask & 0x00f00000) == 0 ){ + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(5))); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j + 5]; + } + } + case 3: + if ((mask & 0x000f0000) == 0) { + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(4))); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j + 4]; + } + } + case 4: + if ((mask & 0x0000f000) == 0) { + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(3))); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j + 3]; + } + } + case 5: + if ((mask & 0x00000f00) == 0) { + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(2))); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j + 2]; + } + } + case 6: + if ((mask & 0x000000f0) == 0) { + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(1))); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j + 1]; + } + } + case 7: + if ((mask & 0x0000000f) == 0) { + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_setzero_si256())); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j]; + } + } } } } diff --git a/1/repair_avx.asm b/1/repair_avx.asm index 7271138..4f128f6 100644 --- a/1/repair_avx.asm +++ b/1/repair_avx.asm @@ -3,14 +3,11 @@ global repair_avx_inner section .text repair_avx_inner: -; vpbroadcastd ymm1, edi ; AVX512VL AVX512F - vmovd xmm1, edi - vpbroadcastd ymm1, xmm1 - vpsubd ymm1, ymm0, ymm1 %assign i 0 %rep 25 - vpcmpeqd ymm2, ymm1, [rsi + i] - vpmovmskb eax, ymm2 + vpcmpeqd ymm1, ymm0, [rdi + i] +; vptest ymm1, ymm1 ; slower then vpmovmskb + test + vpmovmskb eax, ymm1 test eax, eax jne .found %assign i i+32 @@ -20,5 +17,5 @@ repair_avx_inner: ret .found: vzeroupper ; eliminate performance penalties caused by false dependencies when transitioning between AVX and legacy SSE instructions - movd eax, xmm1 + movd eax, xmm0 ; smaller then putting a vmovd before the vzeroupper and no measurable performance difference ret |
