From 6276ed3c7bcd20c8b860e1275386ecd068671bcc Mon Sep 17 00:00:00 2001 From: Tom Smeding Date: Fri, 14 Mar 2025 21:57:56 +0100 Subject: Optimise reductions and dotprod with more vectorisation Turns out that if you don't supply -ffast-math, the C compiler will faithfully reproduce your linear reduction order, which is rather disastrous for parallelisation with vector units. This changes the summation order, so numerical results might differ slightly. To wit: the test suite needed adjustment. --- src/Data/Array/Mixed/Internal/Arith/Foreign.hs | 1 - 1 file changed, 1 deletion(-) (limited to 'src/Data/Array/Mixed/Internal/Arith/Foreign.hs') diff --git a/src/Data/Array/Mixed/Internal/Arith/Foreign.hs b/src/Data/Array/Mixed/Internal/Arith/Foreign.hs index 15fbc79..969a25a 100644 --- a/src/Data/Array/Mixed/Internal/Arith/Foreign.hs +++ b/src/Data/Array/Mixed/Internal/Arith/Foreign.hs @@ -20,7 +20,6 @@ $(do ,("reducefull_" ++ tyn, [t| CInt -> Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr $ttyp -> IO $ttyp |]) ,("extremum_min_" ++ tyn, [t| Ptr Int64 -> Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr $ttyp -> IO () |]) ,("extremum_max_" ++ tyn, [t| Ptr Int64 -> Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr $ttyp -> IO () |]) - ,("dotprod_" ++ tyn, [t| Int64 -> Ptr $ttyp -> Ptr $ttyp -> IO $ttyp |]) ,("dotprod_" ++ tyn ++ "_strided", [t| Int64 -> Int64 -> Int64 -> Ptr $ttyp -> Int64 -> Int64 -> Ptr $ttyp -> IO $ttyp |]) ,("dotprodinner_" ++ tyn, [t| Int64 -> Ptr Int64 -> Ptr $ttyp -> Ptr Int64 -> Ptr $ttyp -> Ptr Int64 -> Ptr $ttyp -> IO () |]) ] -- cgit v1.2.3-70-g09d2