about | summary | refs | log | tree | commit | diff
path: root/cbits
diff options
context:
space:
mode:
author Tom Smeding <tom@tomsmeding.com> 2024-06-11 13:25:05 +0200
committer Tom Smeding <tom@tomsmeding.com> 2024-06-11 13:25:05 +0200
commit bb8089e4aa0aa0aeb4ec42f2e1f0e33d0414e1cf (patch)
tree dc6123b54447d04e8b6a587cd0be7449c83090af /cbits
parent 42b8c69a978b54001aeae62c8c37ce80500d6428 (diff)
Fix SIMD code to allow for unaligned arrays
Diffstat (limited to 'cbits')
-rw-r--r-- cbits/arith.c 4
1 files changed, 2 insertions, 2 deletions
diff --git a/cbits/arith.c b/cbits/arith.c
index d487cfd..73bf99f 100644
--- a/cbits/arith.c
+++ b/cbits/arith.c
@@ -222,7 +222,7 @@ float oxarop_dotprod_float(i64 length, const float *arr1, const float *arr2) {
__m128 accum = _mm_setzero_ps();
i64 i;
for (i = 0; i + 3 < length; i += 4) {
- accum = _mm_add_ps(accum, _mm_mul_ps(_mm_load_ps(arr1 + i), _mm_load_ps(arr2 + i)));
+ accum = _mm_add_ps(accum, _mm_mul_ps(_mm_loadu_ps(arr1 + i), _mm_loadu_ps(arr2 + i)));
}
float dest[4];
_mm_storeu_ps(dest, accum);
@@ -234,7 +234,7 @@ double oxarop_dotprod_double(i64 length, const double *arr1, const double *arr2)
__m128d accum = _mm_setzero_pd();
i64 i;
for (i = 0; i + 1 < length; i += 2) {
- accum = _mm_add_pd(accum, _mm_mul_pd(_mm_load_pd(arr1 + i), _mm_load_pd(arr2 + i)));
+ accum = _mm_add_pd(accum, _mm_mul_pd(_mm_loadu_pd(arr1 + i), _mm_loadu_pd(arr2 + i)));
}
double tot = _mm_cvtsd_f64(accum) + _mm_cvtsd_f64(_mm_unpackhi_pd(accum, accum));
if (i < length) tot += arr1[i] * arr2[i];