Only use Intel SIMD on Intel platforms

author: Tom Smeding <tom@tomsmeding.com> 2024-06-12 22:07:25 +0200
committer: Tom Smeding <tom@tomsmeding.com> 2024-06-12 22:07:25 +0200
commit: a088130c3e722d3c589be388a98daab28a73b23f (patch)
tree: 7e84255479e5df4cbd830ff939cdbd7cd9899e21 /cbits
parent: 39e84802d630ba2ce7e1d51641e39982d6091511 (diff)
1 files changed, 13 insertions, 0 deletions
diff --git a/cbits/arith.c b/cbits/arith.c
index 73bf99f..fb993c8 100644
--- a/cbits/arith.c
+++ b/cbits/arith.c
@@ -1,10 +1,18 @@
+// Architecture detection
+#if defined(__x86_64__) || defined(_M_X64)
+#define OX_ARCH_INTEL
+#endif
+
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
 #include <math.h>
+
+#ifdef OX_ARCH_INTEL
 #include <emmintrin.h>
+#endif
 
 // These are the wrapper macros used in arith_lists.h. Preset them to empty to
 // avoid having to touch macros unrelated to the particular operation set below.
@@ -218,6 +226,7 @@ static double log1pexp_double(double x) { LOG1PEXP_IMPL(x); }
 // The 'double' version here is about 2x as fast as gcc's own vectorisation.
 DOTPROD_OP(i32)
 DOTPROD_OP(i64)
+#ifdef OX_ARCH_INTEL
 float oxarop_dotprod_float(i64 length, const float *arr1, const float *arr2) {
   __m128 accum = _mm_setzero_ps();
   i64 i;
@@ -240,6 +249,10 @@ double oxarop_dotprod_double(i64 length, const double *arr1, const double *arr2)
   if (i < length) tot += arr1[i] * arr2[i];
   return tot;
 }
+#else
+DOTPROD_OP(float)
+DOTPROD_OP(double)
+#endif
 
 
 /*****************************************************************************
author	Tom Smeding <tom@tomsmeding.com>	2024-06-12 22:07:25 +0200
committer	Tom Smeding <tom@tomsmeding.com>	2024-06-12 22:07:25 +0200
commit	a088130c3e722d3c589be388a98daab28a73b23f (patch)
tree	7e84255479e5df4cbd830ff939cdbd7cd9899e21 /cbits
parent	39e84802d630ba2ce7e1d51641e39982d6091511 (diff)