From 6276ed3c7bcd20c8b860e1275386ecd068671bcc Mon Sep 17 00:00:00 2001
From: Tom Smeding <tom@tomsmeding.com>
Date: Fri, 14 Mar 2025 21:57:56 +0100
Subject: Optimise reductions and dotprod with more vectorisation

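It turns out that if you don't supply -ffast-math, the C compiler will
faithfully reproduce your linear reduction order (floating-point
addition is not associative, so the compiler may not reorder the sum
on its own), which is rather disastrous for parallelisation with
vector units.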

This changes the summation order, so numerical results might differ
slightly. To wit: the test suite needed adjustment.
---
 src/Data/Array/Mixed/Internal/Arith/Foreign.hs | 1 -
 1 file changed, 1 deletion(-)

(limited to 'src/Data/Array/Mixed/Internal/Arith/Foreign.hs')

diff --git a/src/Data/Array/Mixed/Internal/Arith/Foreign.hs b/src/Data/Array/Mixed/Internal/Arith/Foreign.hs
index 15fbc79..969a25a 100644
--- a/src/Data/Array/Mixed/Internal/Arith/Foreign.hs
+++ b/src/Data/Array/Mixed/Internal/Arith/Foreign.hs
@@ -20,7 +20,6 @@ $(do
         ,("reducefull_" ++ tyn,              [t| CInt -> Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr $ttyp -> IO $ttyp |])
         ,("extremum_min_" ++ tyn,            [t| Ptr Int64 -> Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr $ttyp -> IO () |])
         ,("extremum_max_" ++ tyn,            [t| Ptr Int64 -> Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr $ttyp -> IO () |])
-        ,("dotprod_" ++ tyn,                 [t| Int64 -> Ptr $ttyp -> Ptr $ttyp -> IO $ttyp |])
         ,("dotprod_" ++ tyn ++ "_strided",   [t| Int64 -> Int64 -> Int64 -> Ptr $ttyp -> Int64 -> Int64 -> Ptr $ttyp -> IO $ttyp |])
         ,("dotprodinner_" ++ tyn,            [t| Int64 -> Ptr Int64 -> Ptr $ttyp -> Ptr Int64 -> Ptr $ttyp -> Ptr Int64 -> Ptr $ttyp -> IO () |])
         ]
-- 
cgit v1.2.3-70-g09d2