2 files changed, 65 insertions, 54 deletions
diff --git a/src/Data/Array/Mixed/Internal/Arith.hs b/src/Data/Array/Mixed/Internal/Arith.hs
index 9f99c3b..fc26633 100644
--- a/src/Data/Array/Mixed/Internal/Arith.hs
+++ b/src/Data/Array/Mixed/Internal/Arith.hs
@@ -31,6 +31,7 @@ import System.IO.Unsafe
 
 import Data.Array.Mixed.Internal.Arith.Foreign
 import Data.Array.Mixed.Internal.Arith.Lists
+import Data.Array.Mixed.Types (fromSNat')
 
 
 -- TODO: need to sort strides for reduction-like functions so that the C inner-loop specialisation has some chance of working even after transposition
@@ -304,36 +305,44 @@ vectorExtremumOp ptrconv fextrem (RS.A (RG.A sh (OI.T strides offset vec)))
              . VS.toList
              <$> VS.unsafeFreeze outvR
 
-vectorDotprodOp :: (Num a, Storable a)
-                => (b -> a)
-                -> (Ptr a -> Ptr b)
-                -> (Int64 -> Ptr b -> Ptr Int64 -> Ptr Int64 -> Ptr b -> IO ())  -- ^ reduction kernel
-                -> (Int64 -> Ptr b -> Ptr b -> IO b)  -- ^ dotprod kernel
-                -> (Int64 -> Int64 -> Int64 -> Ptr b -> Int64 -> Int64 -> Ptr b -> IO b)  -- ^ strided dotprod kernel
-                -> RS.Array 1 a -> RS.Array 1 a -> a
-vectorDotprodOp valbackconv ptrconv fred fdot fdotstrided
-    (RS.A (RG.A [len1] (OI.T [stride1] offset1 vec1)))
-    (RS.A (RG.A [len2] (OI.T [stride2] offset2 vec2)))
-  | len1 /= len2 = error $ "vectorDotprodOp: lengths unequal: " ++ show len1 ++ " vs " ++ show len2
-  | len1 == 0 = 0  -- if the arrays are empty, just return zero
-  | otherwise = case (stride1, stride2) of
-      (0, 0) ->  -- replicated scalar * replicated scalar
-        fromIntegral len1 * (vec1 VS.! offset1) * (vec2 VS.! offset2)
-      (0, 1) ->  -- replicated scalar * dense
-        dotScalarVector len1 ptrconv fred (vec1 VS.! offset1) (VS.slice offset2 len1 vec2)
-      (0, -1) ->  -- replicated scalar * reversed dense
-        dotScalarVector len1 ptrconv fred (vec1 VS.! offset1) (VS.slice (offset2 - (len1 - 1)) len1 vec2)
-      (1, 0) ->  -- dense * replicated scalar
-        dotScalarVector len1 ptrconv fred (vec2 VS.! offset2) (VS.slice offset1 len1 vec1)
-      (-1, 0) ->  -- reversed dense * replicated scalar
-        dotScalarVector len1 ptrconv fred (vec2 VS.! offset2) (VS.slice (offset1 - (len1 - 1)) len1 vec1)
-      (1, 1) ->  -- dense * dense
-        dotVectorVector len1 valbackconv ptrconv fdot (VS.slice offset1 len1 vec1) (VS.slice offset2 len1 vec2)
-      (-1, -1) ->  -- reversed dense * reversed dense
-        dotVectorVector len1 valbackconv ptrconv fdot (VS.slice (offset1 - (len1 - 1)) len1 vec1) (VS.slice (offset2 - (len1 - 1)) len1 vec2)
-      (_, _) ->  -- fallback case
-        dotVectorVectorStrided len1 valbackconv ptrconv fdotstrided offset1 stride1 vec1 offset2 stride2 vec2
-vectorDotprodOp _ _ _ _ _ _ _ = error "vectorDotprodOp: not one-dimensional?"
+vectorDotprodInnerOp :: forall a b n. (Num a, Storable a)
+                     => SNat n
+                     -> (a -> b)
+                     -> (Ptr a -> Ptr b)
+                     -> (SNat n -> RS.Array n a -> RS.Array n a -> RS.Array n a)  -- ^ elementwise multiplication
+                     -> (Int64 -> Ptr b -> b -> Ptr b -> IO ())  -- ^ scale by constant
+                     -> (Int64 -> Ptr b -> Ptr Int64 -> Ptr Int64 -> Ptr b -> IO ())  -- ^ reduction kernel
+                     -> (Int64 -> Ptr Int64 -> Ptr b -> Ptr Int64 -> Ptr b -> Ptr Int64 -> Ptr b -> IO ())  -- ^ dotprod kernel
+                     -> RS.Array (n + 1) a -> RS.Array (n + 1) a -> RS.Array n a
+vectorDotprodInnerOp sn@SNat valconv ptrconv fmul fscale fred fdotinner
+    arr1@(RS.A (RG.A sh1 (OI.T strides1 offset1 vec1)))
+    arr2@(RS.A (RG.A sh2 (OI.T strides2 offset2 vec2)))
+  | null sh1 || null sh2 = error "unreachable"
+  | sh1 /= sh2 = error $ "vectorDotprodInnerOp: shapes unequal: " ++ show sh1 ++ " vs " ++ show sh2
+  | last sh1 <= 0 = RS.stretch (init sh1) (RS.fromList (1 <$ init sh1) [0])
+  | any (<= 0) (init sh1) = RS.A (RG.A (init sh1) (OI.T (0 <$ init strides1) 0 VS.empty))
+  -- now the input arrays are nonempty
+  | last sh1 == 1 = fmul sn (RS.reshape (init sh1) arr1) (RS.reshape (init sh1) arr2)
+  | last strides1 == 0 =
+      fmul sn
+        (RS.A (RG.A (init sh1) (OI.T (init strides1) offset1 vec1)))
+        (vectorRedInnerOp sn valconv ptrconv fscale fred arr2)
+  | last strides2 == 0 =
+      fmul sn
+        (vectorRedInnerOp sn valconv ptrconv fscale fred arr1)
+        (RS.A (RG.A (init sh2) (OI.T (init strides2) offset2 vec2)))
+  -- now there is useful dotprod work along the inner dimension
+  | otherwise = unsafePerformIO $ do
+      let inrank = fromSNat' sn + 1
+      outv <- VSM.unsafeNew (product (init sh1))
+      VSM.unsafeWith outv $ \poutv ->
+        VS.unsafeWith (VS.fromListN inrank (map fromIntegral sh1)) $ \psh ->
+        VS.unsafeWith (VS.fromListN inrank (map fromIntegral strides1)) $ \pstrides1 ->
+        VS.unsafeWith vec1 $ \pvec1 ->
+        VS.unsafeWith (VS.fromListN inrank (map fromIntegral strides2)) $ \pstrides2 ->
+        VS.unsafeWith vec2 $ \pvec2 ->
+          fdotinner (fromIntegral @Int @Int64 inrank) psh (ptrconv poutv) pstrides1 (ptrconv pvec1) pstrides2 (ptrconv pvec2)
+      RS.fromVector @_ @n (init sh1) <$> VS.unsafeFreeze outv
 
 {-# NOINLINE dotScalarVector #-}
 dotScalarVector :: forall a b. (Num a, Storable a)
@@ -461,13 +470,14 @@ $(fmap concat . forM typesList $ \arithtype ->
 
 $(fmap concat . forM typesList $ \arithtype -> do
     let ttyp = conT (atType arithtype)
-        name = mkName ("dotprodVector" ++ nameBase (atType arithtype))
-        c_op = varE (mkName ("c_dotprod_" ++ atCName arithtype))
-        c_op_strided = varE (mkName ("c_dotprod_" ++ atCName arithtype ++ "_strided"))
+        name = mkName ("dotprodinnerVector" ++ nameBase (atType arithtype))
+        c_op = varE (mkName ("c_dotprodinner_" ++ atCName arithtype))
+        mul_op = varE (mkName ("mulVector" ++ nameBase (atType arithtype)))
+        c_scale_op = varE (mkName ("c_binary_" ++ atCName arithtype ++ "_sv")) `appE` litE (integerL (fromIntegral (aboEnum BO_MUL)))
         c_red_op = varE (mkName ("c_reduce1_" ++ atCName arithtype)) `appE` litE (integerL (fromIntegral (aroEnum RO_SUM)))
     sequence [SigD name <$>
-                   [t| RS.Array 1 $ttyp -> RS.Array 1 $ttyp -> $ttyp |]
-             ,do body <- [| vectorDotprodOp id id $c_red_op $c_op $c_op_strided |]
+                   [t| forall n. SNat n -> RS.Array (n + 1) $ttyp -> RS.Array (n + 1) $ttyp -> RS.Array n $ttyp |]
+             ,do body <- [| \sn -> vectorDotprodInnerOp sn id id $mul_op $c_scale_op $c_red_op $c_op |]
                  return $ FunD name [Clause [] (NormalB body) []]])
 
 -- This branch is ostensibly a runtime branch, but will (hopefully) be
@@ -533,19 +543,19 @@ intWidBranchExtr fextr32 fextr64
   | finiteBitSize (undefined :: i) == 64 = vectorExtremumOp @i @Int64 castPtr fextr64
   | otherwise = error "Unsupported Int width"
 
-intWidBranchDotprod :: forall i. (FiniteBits i, Storable i, Integral i)
+intWidBranchDotprod :: forall i n. (FiniteBits i, Storable i, Integral i, NumElt i)
                     => -- int32
-                       (Int64 -> Ptr Int32 -> Ptr Int64 -> Ptr Int64 -> Ptr Int32 -> IO ())  -- ^ reduction kernel
-                    -> (Int64 -> Ptr Int32 -> Ptr Int32 -> IO Int32)  -- ^ dotprod kernel
-                    -> (Int64 -> Int64 -> Int64 -> Ptr Int32 -> Int64 -> Int64 -> Ptr Int32 -> IO Int32)  -- ^ strided dotprod kernel
+                       (Int64 -> Ptr Int32 -> Int32 -> Ptr Int32 -> IO ())  -- ^ scale by constant
+                    -> (Int64 -> Ptr Int32 -> Ptr Int64 -> Ptr Int64 -> Ptr Int32 -> IO ())  -- ^ reduction kernel
+                    -> (Int64 -> Ptr Int64 -> Ptr Int32 -> Ptr Int64 -> Ptr Int32 -> Ptr Int64 -> Ptr Int32 -> IO ())  -- ^ dotprod kernel
                        -- int64
+                    -> (Int64 -> Ptr Int64 -> Int64 -> Ptr Int64 -> IO ())  -- ^ scale by constant
                     -> (Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int64 -> IO ())  -- ^ reduction kernel
-                    -> (Int64 -> Ptr Int64 -> Ptr Int64 -> IO Int64)  -- ^ dotprod kernel
-                    -> (Int64 -> Int64 -> Int64 -> Ptr Int64 -> Int64 -> Int64 -> Ptr Int64 -> IO Int64)  -- ^ strided dotprod kernel
-                    -> (RS.Array 1 i -> RS.Array 1 i -> i)
-intWidBranchDotprod fred32 fdot32 fdot32strided fred64 fdot64 fdot64strided
-  | finiteBitSize (undefined :: i) == 32 = vectorDotprodOp @i @Int32 fromIntegral castPtr fred32 fdot32 fdot32strided
-  | finiteBitSize (undefined :: i) == 64 = vectorDotprodOp @i @Int64 fromIntegral castPtr fred64 fdot64 fdot64strided
+                    -> (Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int64 -> IO ())  -- ^ dotprod kernel
+                    -> (SNat n -> RS.Array (n + 1) i -> RS.Array (n + 1) i -> RS.Array n i)
+intWidBranchDotprod fsc32 fred32 fdot32 fsc64 fred64 fdot64 sn
+  | finiteBitSize (undefined :: i) == 32 = vectorDotprodInnerOp @i @Int32 sn fromIntegral castPtr numEltMul fsc32 fred32 fdot32
+  | finiteBitSize (undefined :: i) == 64 = vectorDotprodInnerOp @i @Int64 sn fromIntegral castPtr numEltMul fsc64 fred64 fdot64
   | otherwise = error "Unsupported Int width"
 
 class NumElt a where
@@ -561,7 +571,7 @@ class NumElt a where
   numEltProductFull :: SNat n -> RS.Array n a -> a
   numEltMinIndex :: RS.Array n a -> [Int]
   numEltMaxIndex :: RS.Array n a -> [Int]
-  numEltDotprod :: RS.Array 1 a -> RS.Array 1 a -> a
+  numEltDotprodInner :: SNat n -> RS.Array (n + 1) a -> RS.Array (n + 1) a -> RS.Array n a
 
 instance NumElt Int32 where
   numEltAdd = addVectorInt32
@@ -576,7 +586,7 @@ instance NumElt Int32 where
   numEltProductFull = productFullVectorInt32
   numEltMinIndex = minindexVectorInt32
   numEltMaxIndex = maxindexVectorInt32
-  numEltDotprod = dotprodVectorInt32
+  numEltDotprodInner = dotprodinnerVectorInt32
 
 instance NumElt Int64 where
   numEltAdd = addVectorInt64
@@ -591,7 +601,7 @@ instance NumElt Int64 where
   numEltProductFull = productFullVectorInt64
   numEltMinIndex = minindexVectorInt64
   numEltMaxIndex = maxindexVectorInt64
-  numEltDotprod = dotprodVectorInt64
+  numEltDotprodInner = dotprodinnerVectorInt64
 
 instance NumElt Float where
   numEltAdd = addVectorFloat
@@ -606,7 +616,7 @@ instance NumElt Float where
   numEltProductFull = productFullVectorFloat
   numEltMinIndex = minindexVectorFloat
   numEltMaxIndex = maxindexVectorFloat
-  numEltDotprod = dotprodVectorFloat
+  numEltDotprodInner = dotprodinnerVectorFloat
 
 instance NumElt Double where
   numEltAdd = addVectorDouble
@@ -621,7 +631,7 @@ instance NumElt Double where
   numEltProductFull = productFullVectorDouble
   numEltMinIndex = minindexVectorDouble
   numEltMaxIndex = maxindexVectorDouble
-  numEltDotprod = dotprodVectorDouble
+  numEltDotprodInner = dotprodinnerVectorDouble
 
 instance NumElt Int where
   numEltAdd = intWidBranch2 @Int (+)
@@ -646,8 +656,8 @@ instance NumElt Int where
   numEltProductFull = intWidBranchRedFull @Int (^) (c_reducefull_i32 (aroEnum RO_PRODUCT)) (c_reducefull_i64 (aroEnum RO_PRODUCT))
   numEltMinIndex = intWidBranchExtr @Int c_extremum_min_i32 c_extremum_min_i64
   numEltMaxIndex = intWidBranchExtr @Int c_extremum_max_i32 c_extremum_max_i64
-  numEltDotprod = intWidBranchDotprod @Int (c_reduce1_i32 (aroEnum RO_SUM)) c_dotprod_i32 c_dotprod_i32_strided
-                                           (c_reduce1_i64 (aroEnum RO_SUM)) c_dotprod_i64 c_dotprod_i64_strided
+  numEltDotprodInner = intWidBranchDotprod @Int (c_binary_i32_sv (aboEnum BO_MUL)) (c_reduce1_i32 (aroEnum RO_SUM)) c_dotprodinner_i32
+                                                (c_binary_i64_sv (aboEnum BO_MUL)) (c_reduce1_i64 (aroEnum RO_SUM)) c_dotprodinner_i64
 
 instance NumElt CInt where
   numEltAdd = intWidBranch2 @CInt (+)
@@ -672,8 +682,8 @@ instance NumElt CInt where
   numEltProductFull = intWidBranchRedFull @CInt (^) (c_reducefull_i32 (aroEnum RO_PRODUCT)) (c_reducefull_i64 (aroEnum RO_PRODUCT))
   numEltMinIndex = intWidBranchExtr @CInt c_extremum_min_i32 c_extremum_min_i64
   numEltMaxIndex = intWidBranchExtr @CInt c_extremum_max_i32 c_extremum_max_i64
-  numEltDotprod = intWidBranchDotprod @CInt (c_reduce1_i32 (aroEnum RO_SUM)) c_dotprod_i32 c_dotprod_i32_strided
-                                            (c_reduce1_i64 (aroEnum RO_SUM)) c_dotprod_i64 c_dotprod_i64_strided
+  numEltDotprodInner = intWidBranchDotprod @CInt (c_binary_i32_sv (aboEnum BO_MUL)) (c_reduce1_i32 (aroEnum RO_SUM)) c_dotprodinner_i32
+                                                 (c_binary_i64_sv (aboEnum BO_MUL)) (c_reduce1_i64 (aroEnum RO_SUM)) c_dotprodinner_i64
 
 class FloatElt a where
   floatEltDiv :: SNat n -> RS.Array n a -> RS.Array n a -> RS.Array n a
diff --git a/src/Data/Array/Mixed/Internal/Arith/Foreign.hs b/src/Data/Array/Mixed/Internal/Arith/Foreign.hs
index c1c0070..ade7ce1 100644
--- a/src/Data/Array/Mixed/Internal/Arith/Foreign.hs
+++ b/src/Data/Array/Mixed/Internal/Arith/Foreign.hs
@@ -22,6 +22,7 @@ $(do
         ,("extremum_max_" ++ tyn,          [t| Ptr Int64 -> Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr $ttyp -> IO () |])
         ,("dotprod_" ++ tyn,               [t| Int64 -> Ptr $ttyp -> Ptr $ttyp -> IO $ttyp |])
         ,("dotprod_" ++ tyn ++ "_strided", [t| Int64 -> Int64 -> Int64 -> Ptr $ttyp -> Int64 -> Int64 -> Ptr $ttyp -> IO $ttyp |])
+        ,("dotprodinner_" ++ tyn,          [t| Int64 -> Ptr Int64 -> Ptr $ttyp -> Ptr Int64 -> Ptr $ttyp -> Ptr Int64 -> Ptr $ttyp -> IO () |])
         ]
 
   let importsFloat ttyp tyn =