summary | refs | log | tree | commit | diff
path: root/1/part2_fast.c
diff options
context:
space:
mode:
Diffstat (limited to '1/part2_fast.c')
-rw-r--r-- 1/part2_fast.c 149
1 files changed, 69 insertions, 80 deletions
diff --git a/1/part2_fast.c b/1/part2_fast.c
index f4f6f3a..c32f881 100644
--- a/1/part2_fast.c
+++ b/1/part2_fast.c
@@ -1,4 +1,3 @@
-#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -8,16 +7,14 @@
#define SEARCH 2020
#ifdef USE_ASM
-int repair_avx_inner(int i, const int *arr, __m256i search);
+int repair_avx_inner(const int *arr, __m256i search);
#else
-int repair_avx_inner(int i, const int *arr, __m256i search)
+int repair_avx_inner(const int *arr, __m256i search)
{
- __m256i cmp = _mm256_sub_epi32(search, _mm256_set1_epi32(i));
for (int k = 0; k < INPUT_LEN; k += 8) {
__m256i new = _mm256_loadu_si256((__m256i *)(&arr[k]));
- int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(new, cmp));
- if (mask != 0) {
- return _mm256_extract_epi32(cmp, 0);
+ if (_mm256_movemask_epi8(_mm256_cmpeq_epi32(new, search))) {
+ return _mm256_extract_epi32(search, 0);
}
}
@@ -33,84 +30,76 @@ int repair_avx(const int *arr)
for (int j = 0; j < INPUT_LEN; j += 8) {
__m256i new = _mm256_loadu_si256((__m256i *)(&arr[j]));
new = _mm256_add_epi32(start, new);
- int mask = _mm256_movemask_epi8(_mm256_cmpgt_epi32(new, search));
+ unsigned int mask = (unsigned int)_mm256_movemask_epi8(_mm256_cmpgt_epi32(new, search));
if (mask == 0xffffffff) {
continue;
}
switch (__lzcnt32(mask) / 4) {
- case 0: goto label0;
- case 1: goto label1;
- case 2: goto label2;
- case 3: goto label3;
- case 4: goto label4;
- case 5: goto label5;
- case 6: goto label6;
- case 7: goto label7;
- }
-
- label0:
- if ((mask & 0x0000000f) == 0) {
- int tmp = repair_avx_inner(_mm256_extract_epi32(new, 0), arr, search);
- if (tmp) {
- return tmp * arr[i] * arr[j];
- }
- }
-
- label1:
- if ((mask & 0x000000f0) == 0) {
- int tmp = repair_avx_inner(_mm256_extract_epi32(new, 1), arr, search);
- if (tmp) {
- return tmp * arr[i] * arr[j + 1];
- }
- }
-
- label2:
- if ((mask & 0x00000f00) == 0) {
- int tmp = repair_avx_inner(_mm256_extract_epi32(new, 2), arr, search);
- if (tmp) {
- return tmp * arr[i] * arr[j + 2];
- }
- }
-
- label3:
- if ((mask & 0x0000f000) == 0) {
- int tmp = repair_avx_inner(_mm256_extract_epi32(new, 3), arr, search);
- if (tmp) {
- return tmp * arr[i] * arr[j + 3];
- }
- }
-
- label4:
- if ((mask & 0x000f0000) == 0) {
- int tmp = repair_avx_inner(_mm256_extract_epi32(new, 4), arr, search);
- if (tmp) {
- return tmp * arr[i] * arr[j + 4];
- }
- }
-
- label5:
- if ((mask & 0x00f00000) == 0 ){
- int tmp = repair_avx_inner(_mm256_extract_epi32(new, 5), arr, search);
- if (tmp) {
- return tmp * arr[i] * arr[j + 5];
- }
- }
-
- label6:
- if ((mask & 0x0f000000) == 0) {
- int tmp = repair_avx_inner(_mm256_extract_epi32(new, 6), arr, search);
- if (tmp) {
- return tmp * arr[i] * arr[j + 6];
- }
- }
-
- label7:
- if ((mask & 0xf0000000) == 0) {
- int tmp = repair_avx_inner(_mm256_extract_epi32(new, 7), arr, search);
- if (tmp) {
- return tmp * arr[i] * arr[j + 7];
- }
+ case 0:
+ {
+ __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(7)));
+ int tmp = repair_avx_inner(arr, cmp);
+ if (tmp) {
+ return tmp * arr[i] * arr[j + 7];
+ }
+ }
+ case 1:
+ if ((mask & 0x0f000000) == 0) {
+ __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(6)));
+ int tmp = repair_avx_inner(arr, cmp);
+ if (tmp) {
+ return tmp * arr[i] * arr[j + 6];
+ }
+ }
+ case 2:
+ if ((mask & 0x00f00000) == 0 ){
+ __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(5)));
+ int tmp = repair_avx_inner(arr, cmp);
+ if (tmp) {
+ return tmp * arr[i] * arr[j + 5];
+ }
+ }
+ case 3:
+ if ((mask & 0x000f0000) == 0) {
+ __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(4)));
+ int tmp = repair_avx_inner(arr, cmp);
+ if (tmp) {
+ return tmp * arr[i] * arr[j + 4];
+ }
+ }
+ case 4:
+ if ((mask & 0x0000f000) == 0) {
+ __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(3)));
+ int tmp = repair_avx_inner(arr, cmp);
+ if (tmp) {
+ return tmp * arr[i] * arr[j + 3];
+ }
+ }
+ case 5:
+ if ((mask & 0x00000f00) == 0) {
+ __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(2)));
+ int tmp = repair_avx_inner(arr, cmp);
+ if (tmp) {
+ return tmp * arr[i] * arr[j + 2];
+ }
+ }
+ case 6:
+ if ((mask & 0x000000f0) == 0) {
+ __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_set1_epi32(1)));
+ int tmp = repair_avx_inner(arr, cmp);
+ if (tmp) {
+ return tmp * arr[i] * arr[j + 1];
+ }
+ }
+ case 7:
+ if ((mask & 0x0000000f) == 0) {
+ __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(new, _mm256_setzero_si256()));
+ int tmp = repair_avx_inner(arr, cmp);
+ if (tmp) {
+ return tmp * arr[i] * arr[j];
+ }
+ }
}
}
}