diff options
Diffstat (limited to 'cbits')
| -rw-r--r-- | cbits/arith.c | 4 | 
1 file changed, 2 insertions, 2 deletions
diff --git a/cbits/arith.c b/cbits/arith.c
index d487cfd..73bf99f 100644
--- a/cbits/arith.c
+++ b/cbits/arith.c
@@ -222,7 +222,7 @@ float oxarop_dotprod_float(i64 length, const float *arr1, const float *arr2) {
   __m128 accum = _mm_setzero_ps();
   i64 i;
   for (i = 0; i + 3 < length; i += 4) {
-    accum = _mm_add_ps(accum, _mm_mul_ps(_mm_load_ps(arr1 + i), _mm_load_ps(arr2 + i)));
+    accum = _mm_add_ps(accum, _mm_mul_ps(_mm_loadu_ps(arr1 + i), _mm_loadu_ps(arr2 + i)));
   }
   float dest[4];
   _mm_storeu_ps(dest, accum);
@@ -234,7 +234,7 @@ double oxarop_dotprod_double(i64 length, const double *arr1, const double *arr2)
   __m128d accum = _mm_setzero_pd();
   i64 i;
   for (i = 0; i + 1 < length; i += 2) {
-    accum = _mm_add_pd(accum, _mm_mul_pd(_mm_load_pd(arr1 + i), _mm_load_pd(arr2 + i)));
+    accum = _mm_add_pd(accum, _mm_mul_pd(_mm_loadu_pd(arr1 + i), _mm_loadu_pd(arr2 + i)));
   }
   double tot = _mm_cvtsd_f64(accum) + _mm_cvtsd_f64(_mm_unpackhi_pd(accum, accum));
   if (i < length) tot += arr1[i] * arr2[i];
