diff options
-rw-r--r-- | cbits/arith.c | 4 |
1 files changed, 2 insertions, 2 deletions
diff --git a/cbits/arith.c b/cbits/arith.c
index d487cfd..73bf99f 100644
--- a/cbits/arith.c
+++ b/cbits/arith.c
@@ -222,7 +222,7 @@ float oxarop_dotprod_float(i64 length, const float *arr1, const float *arr2) {
   __m128 accum = _mm_setzero_ps();
   i64 i;
   for (i = 0; i + 3 < length; i += 4) {
-    accum = _mm_add_ps(accum, _mm_mul_ps(_mm_load_ps(arr1 + i), _mm_load_ps(arr2 + i)));
+    accum = _mm_add_ps(accum, _mm_mul_ps(_mm_loadu_ps(arr1 + i), _mm_loadu_ps(arr2 + i)));
   }
   float dest[4];
   _mm_storeu_ps(dest, accum);
@@ -234,7 +234,7 @@ double oxarop_dotprod_double(i64 length, const double *arr1, const double *arr2)
   __m128d accum = _mm_setzero_pd();
   i64 i;
   for (i = 0; i + 1 < length; i += 2) {
-    accum = _mm_add_pd(accum, _mm_mul_pd(_mm_load_pd(arr1 + i), _mm_load_pd(arr2 + i)));
+    accum = _mm_add_pd(accum, _mm_mul_pd(_mm_loadu_pd(arr1 + i), _mm_loadu_pd(arr2 + i)));
   }
   double tot = _mm_cvtsd_f64(accum) + _mm_cvtsd_f64(_mm_unpackhi_pd(accum, accum));
   if (i < length) tot += arr1[i] * arr2[i];