diff options
| author | Bond_009 <bond.009@outlook.com> | 2021-01-06 16:08:35 +0100 |
|---|---|---|
| committer | Bond_009 <bond.009@outlook.com> | 2021-01-06 16:08:35 +0100 |
| commit | 04e1607c944b5a133f20e5cd1f213e0d2da0702b (patch) | |
| tree | 4972804475168743f178fbca3dd3b0cfb2e035c2 /1/part2_fast.c | |
| parent | c6d4b175c275602cb5b1d0be6123b656dc0bbb27 (diff) | |
Clean up day 1 fast
Diffstat (limited to '1/part2_fast.c')
| -rw-r--r-- | 1/part2_fast.c | 149 |
1 files changed, 69 insertions, 80 deletions
diff --git a/1/part2_fast.c b/1/part2_fast.c index f4f6f3a..c32f881 100644 --- a/1/part2_fast.c +++ b/1/part2_fast.c @@ -1,4 +1,3 @@ -#include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -8,16 +7,14 @@ #define SEARCH 2020 #ifdef USE_ASM -int repair_avx_inner(int i, const int *arr, __m256i search); +int repair_avx_inner(const int *arr, __m256i search); #else -int repair_avx_inner(int i, const int *arr, __m256i search) +int repair_avx_inner(const int *arr, __m256i search) { - __m256i cmp = _mm256_sub_epi32(search, _mm256_set1_epi32(i)); for (int k = 0; k < INPUT_LEN; k += 8) { __m256i new = _mm256_loadu_si256((__m256i *)(&arr[k])); - int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(new, cmp)); - if (mask != 0) { - return _mm256_extract_epi32(cmp, 0); + if (_mm256_movemask_epi8(_mm256_cmpeq_epi32(new, search))) { + return _mm256_extract_epi32(search, 0); } } @@ -33,84 +30,76 @@ int repair_avx(const int *arr) for (int j = 0; j < INPUT_LEN; j += 8) { __m256i new = _mm256_loadu_si256((__m256i *)(&arr[j])); new = _mm256_add_epi32(start, new); - int mask = _mm256_movemask_epi8(_mm256_cmpgt_epi32(new, search)); + unsigned int mask = (unsigned int)_mm256_movemask_epi8(_mm256_cmpgt_epi32(new, search)); if (mask == 0xffffffff) { continue; } switch (__lzcnt32(mask) / 4) { - case 0: goto label0; - case 1: goto label1; - case 2: goto label2; - case 3: goto label3; - case 4: goto label4; - case 5: goto label5; - case 6: goto label6; - case 7: goto label7; - } - - label0: - if ((mask & 0x0000000f) == 0) { - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 0), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j]; - } - } - - label1: - if ((mask & 0x000000f0) == 0) { - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 1), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j + 1]; - } - } - - label2: - if ((mask & 0x00000f00) == 0) { - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 2), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j + 2]; - } - } - - label3: - if ((mask & 0x0000f000) == 0) { - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 3), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j + 3]; - } - } - - label4: - if ((mask & 0x000f0000) == 0) { - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 4), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j + 4]; - } - } - - label5: - if ((mask & 0x00f00000) == 0 ){ - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 5), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j + 5]; - } - } - - label6: - if ((mask & 0x0f000000) == 0) { - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 6), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j + 6]; - } - } - - label7: - if ((mask & 0xf0000000) == 0) { - int tmp = repair_avx_inner(_mm256_extract_epi32(new, 7), arr, search); - if (tmp) { - return tmp * arr[i] * arr[j + 7]; - } + case 0: + { + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(7))); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j + 7]; + } + } + case 1: + if ((mask & 0x0f000000) == 0) { + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(6))); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j + 6]; + } + } + case 2: + if ((mask & 0x00f00000) == 0 ){ + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(5))); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j + 5]; + } + } + case 3: + if ((mask & 0x000f0000) == 0) { + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(4))); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j + 4]; + } + } + case 4: + if ((mask & 0x0000f000) == 0) { + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(3))); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j + 3]; + } + } + case 5: + if ((mask & 0x00000f00) == 0) { + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(2))); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j + 2]; + } + } + case 6: + if ((mask & 0x000000f0) == 0) { + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(1))); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j + 1]; + } + } + case 7: + if ((mask & 0x0000000f) == 0) { + __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_setzero_si256())); + int tmp = repair_avx_inner(arr, cmp); + if (tmp) { + return tmp * arr[i] * arr[j]; + } + } } } } |
