summaryrefslogtreecommitdiff
path: root/1/part2_fast.c
blob: c32f88148ef93387b60a0bba8ea25703f30de190 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>

#define INPUT_LEN 200
#define SEARCH 2020

#ifdef USE_ASM
/* With USE_ASM defined only the prototype is provided here; the definition
 * must come from elsewhere at link time. */
int repair_avx_inner(const int *arr, __m256i search);
#else
/*
 * Scan the first INPUT_LEN ints of arr, eight at a time, for an element that
 * equals the 32-bit lane of search at the same position within its group of
 * eight.
 *
 * Returns lane 0 of search as soon as any lane matches, or 0 when no group
 * contains a match. A match on the value 0 is therefore indistinguishable
 * from "not found".
 *
 * INPUT_LEN is stepped in units of eight with unaligned 256-bit loads, so arr
 * must hold at least INPUT_LEN rounded up to a multiple of eight ints.
 */
int repair_avx_inner(const int *arr, __m256i search)
{
    int offset = 0;

    while (offset < INPUT_LEN) {
        const __m256i chunk = _mm256_loadu_si256((const __m256i *)(arr + offset));
        const __m256i equal = _mm256_cmpeq_epi32(chunk, search);

        /* Any set byte in the comparison result means at least one lane hit. */
        if (_mm256_movemask_epi8(equal) != 0) {
            return _mm256_extract_epi32(search, 0);
        }

        offset += 8;
    }

    return 0;
}
#endif

/*
 * Search arr (INPUT_LEN ints) for three entries whose sum is SEARCH and
 * return their product, or 0 when no such triple is found.
 *
 * For every entry arr[i] the array is walked in groups of eight: the eight
 * pair sums arr[i] + arr[j..j+7] are formed at once, and for each pair sum
 * that does not already exceed SEARCH the remainder SEARCH - sum is looked up
 * in the whole array via repair_avx_inner().
 *
 * Notes:
 *  - Pair sums greater than SEARCH are pruned, so a triple that needs a
 *    negative third entry is never reported.
 *  - The same array slot may be used more than once in a triple; no index
 *    distinctness is enforced.
 *  - repair_avx_inner() reports "not found" as 0, so a remainder of 0 is
 *    treated as a miss (the product would be 0 anyway).
 *  - The inner loop steps by eight with 256-bit loads, so INPUT_LEN is
 *    expected to be a multiple of eight.
 */
int repair_avx(const int *arr)
{
    const __m256i search = _mm256_set1_epi32(SEARCH);

    for (int i = 0; i < INPUT_LEN; i++) {
        const __m256i start = _mm256_set1_epi32(arr[i]);

        for (int j = 0; j < INPUT_LEN; j += 8) {
            __m256i sums = _mm256_loadu_si256((const __m256i *)(&arr[j]));
            sums = _mm256_add_epi32(start, sums);

            /* Four mask bits per 32-bit lane; a lane's nibble is 0xf when its
             * pair sum is already larger than SEARCH. */
            unsigned int mask = (unsigned int)_mm256_movemask_epi8(_mm256_cmpgt_epi32(sums, search));
            if (mask == 0xffffffff) {
                continue;
            }

            /* Probe every lane whose pair sum is still a candidate. The
             * previous switch entered at lzcnt(mask) / 4, which skipped the
             * leading candidate lanes (and every lane when mask == 0). */
            for (int lane = 7; lane >= 0; lane--) {
                if (((mask >> (4 * lane)) & 0xfu) != 0) {
                    continue;
                }

                /* Broadcast this lane's pair sum and turn it into the value
                 * the third entry must have. */
                __m256i cmp = _mm256_sub_epi32(search, _mm256_permutevar8x32_epi32(sums, _mm256_set1_epi32(lane)));
                int tmp = repair_avx_inner(arr, cmp);
                if (tmp) {
                    return tmp * arr[i] * arr[j + lane];
                }
            }
        }
    }

    return 0;
}

/*
 * Read up to INPUT_LEN integers, one per line, from the file named by the
 * last command-line argument, run repair_avx() on them and print the result.
 *
 * Slots for which the file provides no line stay 0.
 *
 * Returns 1 when no file argument was given or the file cannot be opened,
 * 0 otherwise.
 */
int main(int argc, char *argv[])
{
    /* Without an argument argv[argc - 1] would be the program itself (or out
     * of bounds when argc is 0). */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <input-file>\n", argc > 0 ? argv[0] : "part2_fast");
        return 1;
    }

    FILE *file = fopen(argv[argc - 1], "r");
    if (!file) {
        perror(argv[argc - 1]);
        return 1;
    }

    char buffer[32] = { 0 };
    int input[INPUT_LEN] = { 0 };
    for (int i = 0; i < INPUT_LEN; i++) {
        /* Stop at EOF or a read error instead of re-parsing the stale buffer,
         * which would duplicate the last value into the remaining slots. */
        if (!fgets(buffer, sizeof buffer, file)) {
            break;
        }
        input[i] = (int)strtol(buffer, NULL, 10);
    }

    fclose(file);

    printf("%i\n", repair_avx(input));
    return 0;
}