diff options
author | Tom Smeding <tom@tomsmeding.com> | 2024-06-11 13:25:05 +0200 |
---|---|---|
committer | Tom Smeding <tom@tomsmeding.com> | 2024-06-11 13:25:05 +0200 |
commit | bb8089e4aa0aa0aeb4ec42f2e1f0e33d0414e1cf (patch) | |
tree | dc6123b54447d04e8b6a587cd0be7449c83090af | |
parent | 42b8c69a978b54001aeae62c8c37ce80500d6428 (diff) |
Fix SIMD code to allow for unaligned arrays
-rw-r--r-- | cbits/arith.c | 4 |
1 file changed, 2 insertions, 2 deletions
```diff
diff --git a/cbits/arith.c b/cbits/arith.c
index d487cfd..73bf99f 100644
--- a/cbits/arith.c
+++ b/cbits/arith.c
@@ -222,7 +222,7 @@ float oxarop_dotprod_float(i64 length, const float *arr1, const float *arr2) {
   __m128 accum = _mm_setzero_ps();
   i64 i;
   for (i = 0; i + 3 < length; i += 4) {
-    accum = _mm_add_ps(accum, _mm_mul_ps(_mm_load_ps(arr1 + i), _mm_load_ps(arr2 + i)));
+    accum = _mm_add_ps(accum, _mm_mul_ps(_mm_loadu_ps(arr1 + i), _mm_loadu_ps(arr2 + i)));
   }
   float dest[4];
   _mm_storeu_ps(dest, accum);
@@ -234,7 +234,7 @@ double oxarop_dotprod_double(i64 length, const double *arr1, const double *arr2)
   __m128d accum = _mm_setzero_pd();
   i64 i;
   for (i = 0; i + 1 < length; i += 2) {
-    accum = _mm_add_pd(accum, _mm_mul_pd(_mm_load_pd(arr1 + i), _mm_load_pd(arr2 + i)));
+    accum = _mm_add_pd(accum, _mm_mul_pd(_mm_loadu_pd(arr1 + i), _mm_loadu_pd(arr2 + i)));
   }
   double tot = _mm_cvtsd_f64(accum) + _mm_cvtsd_f64(_mm_unpackhi_pd(accum, accum));
   if (i < length) tot += arr1[i] * arr2[i];
```