summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- 1/part2_fast.c 23
-rw-r--r-- 1/repair_avx.asm 29
2 files changed, 43 insertions, 9 deletions
diff --git a/1/part2_fast.c b/1/part2_fast.c
index 4523c65..ffb4504 100644
--- a/1/part2_fast.c
+++ b/1/part2_fast.c
@@ -7,7 +7,10 @@
#define INPUT_LEN 200
#define SEARCH 2020
-int inner_repair_avx(int i, const int *arr, __m256i search)
+#ifdef USE_ASM
+int repair_avx_inner(int i, const int *arr, __m256i search);
+#else
+int repair_avx_inner(int i, const int *arr, __m256i search)
{
__m256i start = _mm256_set1_epi32(i);
for (int k = 0; k < INPUT_LEN; k += 8) {
@@ -27,6 +30,7 @@ int inner_repair_avx(int i, const int *arr, __m256i search)
return 0;
}
+#endif
int repair_avx(const int *arr)
{
@@ -54,7 +58,7 @@ int repair_avx(const int *arr)
label0:
if ((mask & 0x0000000f) == 0) {
- int tmp = inner_repair_avx(_mm256_extract_epi32(new, 0), arr, search);
+ int tmp = repair_avx_inner(_mm256_extract_epi32(new, 0), arr, search);
if (tmp) {
return tmp * arr[i] * arr[j];
}
@@ -62,7 +66,7 @@ int repair_avx(const int *arr)
label1:
if ((mask & 0x000000f0) == 0) {
- int tmp = inner_repair_avx(_mm256_extract_epi32(new, 1), arr, search);
+ int tmp = repair_avx_inner(_mm256_extract_epi32(new, 1), arr, search);
if (tmp) {
return tmp * arr[i] * arr[j + 1];
}
@@ -70,7 +74,7 @@ int repair_avx(const int *arr)
label2:
if ((mask & 0x00000f00) == 0) {
- int tmp = inner_repair_avx(_mm256_extract_epi32(new, 2), arr, search);
+ int tmp = repair_avx_inner(_mm256_extract_epi32(new, 2), arr, search);
if (tmp) {
return tmp * arr[i] * arr[j + 2];
}
@@ -78,7 +82,7 @@ int repair_avx(const int *arr)
label3:
if ((mask & 0x0000f000) == 0) {
- int tmp = inner_repair_avx(_mm256_extract_epi32(new, 3), arr, search);
+ int tmp = repair_avx_inner(_mm256_extract_epi32(new, 3), arr, search);
if (tmp) {
return tmp * arr[i] * arr[j + 3];
}
@@ -86,7 +90,7 @@ int repair_avx(const int *arr)
label4:
if ((mask & 0x000f0000) == 0) {
- int tmp = inner_repair_avx(_mm256_extract_epi32(new, 4), arr, search);
+ int tmp = repair_avx_inner(_mm256_extract_epi32(new, 4), arr, search);
if (tmp) {
return tmp * arr[i] * arr[j + 4];
}
@@ -94,7 +98,7 @@ int repair_avx(const int *arr)
label5:
if ((mask & 0x00f00000) == 0 ){
- int tmp = inner_repair_avx(_mm256_extract_epi32(new, 5), arr, search);
+ int tmp = repair_avx_inner(_mm256_extract_epi32(new, 5), arr, search);
if (tmp) {
return tmp * arr[i] * arr[j + 5];
}
@@ -102,7 +106,7 @@ int repair_avx(const int *arr)
label6:
if ((mask & 0x0f000000) == 0) {
- int tmp = inner_repair_avx(_mm256_extract_epi32(new, 6), arr, search);
+ int tmp = repair_avx_inner(_mm256_extract_epi32(new, 6), arr, search);
if (tmp) {
return tmp * arr[i] * arr[j + 6];
}
@@ -110,7 +114,7 @@ int repair_avx(const int *arr)
label7:
if ((mask & 0xf0000000) == 0) {
- int tmp = inner_repair_avx(_mm256_extract_epi32(new, 7), arr, search);
+ int tmp = repair_avx_inner(_mm256_extract_epi32(new, 7), arr, search);
if (tmp) {
return tmp * arr[i] * arr[j + 7];
}
@@ -121,6 +125,7 @@ int repair_avx(const int *arr)
return 0;
}
+
int main(int argc, char *argv[])
{
FILE *file = fopen(argv[argc - 1], "r");
diff --git a/1/repair_avx.asm b/1/repair_avx.asm
new file mode 100644
index 0000000..777e292
--- /dev/null
+++ b/1/repair_avx.asm
@@ -0,0 +1,29 @@
+global repair_avx_inner
+; C prototype (from part2_fast.c): int repair_avx_inner(int i, const int *arr, __m256i search);
+section .text
+; In: edi = i, rsi = arr, ymm0 = search (as used below; matches the System V AMD64 convention). Out: eax = first arr element x with i + x equal to its search lane, or 0 if none of the 200 dwords (25 blocks of 8) match. Modifies rsi, edx, ymm1, ymm2 and flags.
+repair_avx_inner:
+; vpbroadcastd ymm1, edi ; AVX512VL AVX512F -- the GPR-source broadcast is not available here, so i is routed through xmm1 instead
+ vmovd xmm1, edi ; low dword of xmm1 = i
+ vpbroadcastd ymm1, xmm1 ; ymm1 = i replicated into all 8 dword lanes
+%rep 24
+ vpaddd ymm2, ymm1, [rsi] ; ymm2 = i + each of the 8 dwords of the current block
+ vpcmpeqd ymm2, ymm2, ymm0 ; lanes equal to the search value become all-ones, the rest zero
+ vpmovmskb edx, ymm2 ; edx = one bit per byte of ymm2, so 4 set bits per matching lane
+ test edx, edx ; any lane matched?
+ jne .found ; yes: rsi still points at the block containing the match
+ add rsi, 32 ; no match: advance to the next 256 bits (32 bytes) (8 * dword)
+%endrep
+ vpaddd ymm2, ymm1, [rsi] ; 25th and last block (24 unrolled above + this one); no pointer advance follows
+ vpcmpeqd ymm2, ymm2, ymm0 ; same comparison as in the unrolled blocks
+ vpmovmskb edx, ymm2 ; same byte mask as in the unrolled blocks
+ test edx, edx ; any lane matched in the last block?
+ jne .found
+ xor eax, eax ; not found, return 0
+ vzeroupper ; zero the upper halves of the ymm registers to avoid AVX <-> legacy SSE transition penalties after returning
+ ret
+.found:
+ bsf edx, edx ; lowest set bit of the byte mask = byte offset of the first matching lane within the block
+ mov eax, dword [rsi + rdx] ; return the matching array element (writing edx above zero-extended rdx)
+ vzeroupper ; zero the upper halves of the ymm registers to avoid AVX <-> legacy SSE transition penalties after returning
+ ret