diff options
| author | Tom Smeding <tom@tomsmeding.com> | 2024-06-11 13:25:05 +0200 | 
|---|---|---|
| committer | Tom Smeding <tom@tomsmeding.com> | 2024-06-11 13:25:05 +0200 | 
| commit | bb8089e4aa0aa0aeb4ec42f2e1f0e33d0414e1cf (patch) | |
| tree | dc6123b54447d04e8b6a587cd0be7449c83090af | |
| parent | 42b8c69a978b54001aeae62c8c37ce80500d6428 (diff) | |
Fix SIMD code to allow for unaligned arrays
| -rw-r--r-- | cbits/arith.c | 4 | 
1 file changed, 2 insertions, 2 deletions
```diff
diff --git a/cbits/arith.c b/cbits/arith.c
index d487cfd..73bf99f 100644
--- a/cbits/arith.c
+++ b/cbits/arith.c
@@ -222,7 +222,7 @@ float oxarop_dotprod_float(i64 length, const float *arr1, const float *arr2) {
   __m128 accum = _mm_setzero_ps();
   i64 i;
   for (i = 0; i + 3 < length; i += 4) {
-    accum = _mm_add_ps(accum, _mm_mul_ps(_mm_load_ps(arr1 + i), _mm_load_ps(arr2 + i)));
+    accum = _mm_add_ps(accum, _mm_mul_ps(_mm_loadu_ps(arr1 + i), _mm_loadu_ps(arr2 + i)));
   }
   float dest[4];
   _mm_storeu_ps(dest, accum);
@@ -234,7 +234,7 @@ double oxarop_dotprod_double(i64 length, const double *arr1, const double *arr2)
   __m128d accum = _mm_setzero_pd();
   i64 i;
   for (i = 0; i + 1 < length; i += 2) {
-    accum = _mm_add_pd(accum, _mm_mul_pd(_mm_load_pd(arr1 + i), _mm_load_pd(arr2 + i)));
+    accum = _mm_add_pd(accum, _mm_mul_pd(_mm_loadu_pd(arr1 + i), _mm_loadu_pd(arr2 + i)));
   }
   double tot = _mm_cvtsd_f64(accum) + _mm_cvtsd_f64(_mm_unpackhi_pd(accum, accum));
   if (i < length) tot += arr1[i] * arr2[i];
```
