From 9b0651bf19e889dfb28ba81b6ada25b27b0e6071 Mon Sep 17 00:00:00 2001 From: Tom Smeding Date: Mon, 17 Jun 2024 13:08:13 +0200 Subject: sumAllPrim --- cbits/arith.c | 49 +++++++- cbits/arith_lists.h | 4 +- src/Data/Array/Mixed/Internal/Arith.hs | 150 +++++++++++++++++++------ src/Data/Array/Mixed/Internal/Arith/Foreign.hs | 10 +- src/Data/Array/Mixed/XArray.hs | 4 +- src/Data/Array/Nested.hs | 6 +- src/Data/Array/Nested/Internal/Mixed.hs | 3 + src/Data/Array/Nested/Internal/Ranked.hs | 3 + src/Data/Array/Nested/Internal/Shaped.hs | 3 + 9 files changed, 184 insertions(+), 48 deletions(-) diff --git a/cbits/arith.c b/cbits/arith.c index fb993c8..5d74c01 100644 --- a/cbits/arith.c +++ b/cbits/arith.c @@ -173,6 +173,33 @@ static double log1pexp_double(double x) { LOG1PEXP_IMPL(x); } } \ } +// preconditions +// - all strides are >0 +// - shape is everywhere >0 +// - rank is >= 1 +#define REDUCEFULL_OP(name, op, typ) \ + typ oxarop_op_ ## name ## _ ## typ(i64 rank, const i64 *shape, const i64 *strides, const typ *arr) { \ + typ res = 0; \ + if (strides[rank - 1] == 1) { \ + TARRAY_WALK_NOINNER(again1, rank, shape, strides, { \ + typ accum = arr[arrlinidx]; \ + for (i64 i = 1; i < shape[rank - 1]; i++) { \ + accum = accum op arr[arrlinidx + i]; \ + } \ + res = res op accum; \ + }); \ + } else { \ + TARRAY_WALK_NOINNER(again2, rank, shape, strides, { \ + typ accum = arr[arrlinidx]; \ + for (i64 i = 1; i < shape[rank - 1]; i++) { \ + accum = accum op arr[arrlinidx + strides[rank - 1] * i]; \ + } \ + res = res op accum; \ + }); \ + } \ + return res; \ + } + // preconditions // - all strides are >0 // - shape is everywhere >0 @@ -394,11 +421,20 @@ enum redop_tag_t { #define LIST_REDOP(name, id, _) }; -#define ENTRY_REDUCE_OPS(typ) \ - void oxarop_reduce_ ## typ(enum redop_tag_t tag, i64 rank, const i64 *shape, const i64 *strides, typ *out, const typ *arr) { \ +#define ENTRY_REDUCE1_OPS(typ) \ + void oxarop_reduce1_ ## typ(enum redop_tag_t tag, i64 rank, const i64 *shape, const i64 *strides, typ *out, const typ *arr) { \ + switch (tag) { \ + case RO_SUM: oxarop_op_sum1_ ## typ(rank, shape, strides, out, arr); break; \ + case RO_PRODUCT: oxarop_op_product1_ ## typ(rank, shape, strides, out, arr); break; \ + default: wrong_op("reduce", tag); \ + } \ + } + +#define ENTRY_REDUCEFULL_OPS(typ) \ + typ oxarop_reducefull_ ## typ(enum redop_tag_t tag, i64 rank, const i64 *shape, const i64 *strides, const typ *arr) { \ switch (tag) { \ - case RO_SUM1: oxarop_op_sum1_ ## typ(rank, shape, strides, out, arr); break; \ - case RO_PRODUCT1: oxarop_op_product1_ ## typ(rank, shape, strides, out, arr); break; \ + case RO_SUM: return oxarop_op_sumfull_ ## typ(rank, shape, strides, arr); \ + case RO_PRODUCT: return oxarop_op_productfull_ ## typ(rank, shape, strides, arr); \ default: wrong_op("reduce", tag); \ } \ } @@ -420,9 +456,12 @@ enum redop_tag_t { UNARY_OP(signum, GEN_SIGNUM, typ) \ REDUCE1_OP(sum1, +, typ) \ REDUCE1_OP(product1, *, typ) \ + REDUCEFULL_OP(sumfull, +, typ) \ + REDUCEFULL_OP(productfull, *, typ) \ ENTRY_BINARY_OPS(typ) \ ENTRY_UNARY_OPS(typ) \ - ENTRY_REDUCE_OPS(typ) \ + ENTRY_REDUCE1_OPS(typ) \ + ENTRY_REDUCEFULL_OPS(typ) \ EXTREMUM_OP(min, <, typ) \ EXTREMUM_OP(max, >, typ) \ DOTPROD_STRIDED_OP(typ) diff --git a/cbits/arith_lists.h b/cbits/arith_lists.h index 2e37575..58de65a 100644 --- a/cbits/arith_lists.h +++ b/cbits/arith_lists.h @@ -31,5 +31,5 @@ LIST_FUNOP(FU_EXPM1, 18,) LIST_FUNOP(FU_LOG1PEXP, 19,) LIST_FUNOP(FU_LOG1MEXP, 20,) -LIST_REDOP(RO_SUM1, 1,) -LIST_REDOP(RO_PRODUCT1, 2,) +LIST_REDOP(RO_SUM, 1,) +LIST_REDOP(RO_PRODUCT, 2,) diff --git a/src/Data/Array/Mixed/Internal/Arith.hs b/src/Data/Array/Mixed/Internal/Arith.hs index 579c0da..d547084 100644 --- a/src/Data/Array/Mixed/Internal/Arith.hs +++ b/src/Data/Array/Mixed/Internal/Arith.hs @@ -33,6 +33,9 @@ import Data.Array.Mixed.Internal.Arith.Foreign import Data.Array.Mixed.Internal.Arith.Lists +-- TODO: need to sort strides for reduction-like functions so that the C inner-loop specialisation has some chance of working even after transposition + + -- TODO: test all the cases of this thing with various input strides liftVEltwise1 :: (Storable a, Storable b) => SNat n @@ -186,7 +189,7 @@ vectorRedInnerOp sn@SNat valconv ptrconv fscale fred (RS.A (RG.A sh (OI.T stride -- precondition that there are no such dimensions in its input). replDims = map (== 0) strides -- filter out replicated dimensions - (shF, stridesF) = unzip $ map fst $ filter (not . snd) (zip (zip sh strides) replDims) + (shF, stridesF) = unzip [(n, s) | (n, s, False) <- zip3 sh strides replDims] -- replace replicated dimensions with ones shOnes = zipWith (\n repl -> if repl then 1 else n) sh replDims ndimsF = length shF -- > 0, otherwise `last strides == 0` @@ -213,6 +216,48 @@ vectorRedInnerOp sn@SNat valconv ptrconv fscale fred (RS.A (RG.A sh (OI.T stride . RS.fromVector @_ @lenFm1 (init shF) -- the partially-reversed result array <$> VS.unsafeFreeze outvR +-- TODO: test handling of negative strides +-- | Reduce full array +{-# NOINLINE vectorRedFullOp #-} +vectorRedFullOp :: forall a b n. (Num a, Storable a) + => SNat n + -> (a -> Int -> a) + -> (b -> a) + -> (Ptr a -> Ptr b) + -> (Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr b -> IO b) -- ^ reduction kernel + -> RS.Array n a -> a +vectorRedFullOp _ scaleval valbackconv ptrconv fred (RS.A (RG.A sh (OI.T strides offset vec))) + | null sh = vec VS.! offset -- 0D array has one element + | any (<= 0) sh = 0 + -- now the input array is nonempty + | all (== 0) strides = fromIntegral (product sh) * vec VS.! offset + -- now there is at least one non-replicated dimension + | otherwise = + let -- replicated dimensions: dimensions with zero stride. The reduction + -- kernel need not concern itself with those (and in fact has a + -- precondition that there are no such dimensions in its input). + replDims = map (== 0) strides + -- filter out replicated dimensions + (shF, stridesF) = unzip [(n, s) | (n, s, False) <- zip3 sh strides replDims] + ndimsF = length shF -- > 0, otherwise `all (== 0) strides` + -- we should scale up the output this many times to account for the replicated dimensions + multiplier = product [n | (n, True) <- zip sh replDims] + + -- reversed dimensions: dimensions with negative stride. Reversal is + -- irrelevant for a reduction, and indeed the kernel has a + -- precondition that there are no such dimensions. + revDims = map (< 0) stridesF + stridesR = map abs stridesF + offsetR = offset + sum (zipWith3 (\rev n s -> if rev then (n - 1) * s else 0) revDims shF stridesF) + -- The *R values give an array with strides all > 0, hence the + -- left-most element is at offsetR. + in unsafePerformIO $ do + VS.unsafeWith (VS.fromListN ndimsF (map fromIntegral shF)) $ \pshF -> + VS.unsafeWith (VS.fromListN ndimsF (map fromIntegral stridesR)) $ \pstridesR -> + VS.unsafeWith (VS.slice offsetR (VS.length vec - offsetR) vec) $ \pvecR -> + (`scaleval` fromIntegral multiplier) . valbackconv + <$> fred (fromIntegral ndimsF) pshF pstridesR (ptrconv pvecR) + -- TODO: test this function -- | Find extremum (minindex ("argmin") or maxindex) in full array {-# NOINLINE vectorExtremumOp #-} @@ -232,7 +277,7 @@ vectorExtremumOp ptrconv fextrem (RS.A (RG.A sh (OI.T strides offset vec))) -- precondition that there are no such dimensions in its input). replDims = map (== 0) strides -- filter out replicated dimensions - (shF, stridesF) = unzip $ map fst $ filter (not . snd) (zip (zip sh strides) replDims) + (shF, stridesF) = unzip [(n, s) | (n, s, False) <- zip3 sh strides replDims] ndimsF = length shF -- > 0, because not all strides were <=0 -- un-reverse reversed dimensions @@ -380,16 +425,29 @@ $(fmap concat . forM floatTypesList $ \arithtype -> do ,do body <- [| \sn -> liftVEltwise1 sn (vectorOp1 id $c_op) |] return $ FunD name [Clause [] (NormalB body) []]]) +mulWithInt :: Num a => a -> Int -> a +mulWithInt a i = a * fromIntegral i + $(fmap concat . forM typesList $ \arithtype -> do let ttyp = conT (atType arithtype) fmap concat . forM [minBound..maxBound] $ \arithop -> do - let name = mkName (aroName arithop ++ "Vector" ++ nameBase (atType arithtype)) - c_op = varE (mkName ("c_reduce_" ++ atCName arithtype)) `appE` litE (integerL (fromIntegral (aroEnum arithop))) + let scaleVar = case arithop of + RO_SUM -> varE 'mulWithInt + RO_PRODUCT -> varE '(^) + let name1 = mkName (aroName arithop ++ "1Vector" ++ nameBase (atType arithtype)) + namefull = mkName (aroName arithop ++ "FullVector" ++ nameBase (atType arithtype)) + c_op1 = varE (mkName ("c_reduce1_" ++ atCName arithtype)) `appE` litE (integerL (fromIntegral (aroEnum arithop))) + c_opfull = varE (mkName ("c_reducefull_" ++ atCName arithtype)) `appE` litE (integerL (fromIntegral (aroEnum arithop))) c_scale_op = varE (mkName ("c_binary_" ++ atCName arithtype ++ "_sv")) `appE` litE (integerL (fromIntegral (aboEnum BO_MUL))) - sequence [SigD name <$> + sequence [SigD name1 <$> [t| forall n. SNat n -> RS.Array (n + 1) $ttyp -> RS.Array n $ttyp |] - ,do body <- [| \sn -> vectorRedInnerOp sn id id $c_scale_op $c_op |] - return $ FunD name [Clause [] (NormalB body) []]]) + ,do body <- [| \sn -> vectorRedInnerOp sn id id $c_scale_op $c_op1 |] + return $ FunD name1 [Clause [] (NormalB body) []] + ,SigD namefull <$> + [t| forall n. SNat n -> RS.Array n $ttyp -> $ttyp |] + ,do body <- [| \sn -> vectorRedFullOp sn $scaleVar id id $c_opfull |] + return $ FunD namefull [Clause [] (NormalB body) []] + ]) $(fmap concat . forM typesList $ \arithtype -> fmap concat . forM ["min", "max"] $ \fname -> do @@ -406,7 +464,7 @@ $(fmap concat . forM typesList $ \arithtype -> do name = mkName ("dotprodVector" ++ nameBase (atType arithtype)) c_op = varE (mkName ("c_dotprod_" ++ atCName arithtype)) c_op_strided = varE (mkName ("c_dotprod_" ++ atCName arithtype ++ "_strided")) - c_red_op = varE (mkName ("c_reduce_" ++ atCName arithtype)) `appE` litE (integerL (fromIntegral (aroEnum RO_SUM1))) + c_red_op = varE (mkName ("c_reduce1_" ++ atCName arithtype)) `appE` litE (integerL (fromIntegral (aroEnum RO_SUM))) sequence [SigD name <$> [t| RS.Array 1 $ttyp -> RS.Array 1 $ttyp -> $ttyp |] ,do body <- [| vectorDotprodOp id id $c_red_op $c_op $c_op_strided |] @@ -439,19 +497,31 @@ intWidBranch2 ss sv32 vs32 vv32 sv64 vs64 vv64 sn | finiteBitSize (undefined :: i) == 64 = liftVEltwise2 sn (vectorOp2 @i @Int64 fromIntegral castPtr ss sv64 vs64 vv64) | otherwise = error "Unsupported Int width" -intWidBranchRed :: forall i n. (FiniteBits i, Storable i, Integral i) - => -- int32 - (Int64 -> Ptr Int32 -> Int32 -> Ptr Int32 -> IO ()) -- ^ scale by constant - -> (Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int32 -> Ptr Int32 -> IO ()) -- ^ reduction kernel - -- int64 - -> (Int64 -> Ptr Int64 -> Int64 -> Ptr Int64 -> IO ()) -- ^ scale by constant - -> (Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int64 -> IO ()) -- ^ reduction kernel - -> (SNat n -> RS.Array (n + 1) i -> RS.Array n i) -intWidBranchRed fsc32 fred32 fsc64 fred64 sn +intWidBranchRed1 :: forall i n. (FiniteBits i, Storable i, Integral i) + => -- int32 + (Int64 -> Ptr Int32 -> Int32 -> Ptr Int32 -> IO ()) -- ^ scale by constant + -> (Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int32 -> Ptr Int32 -> IO ()) -- ^ reduction kernel + -- int64 + -> (Int64 -> Ptr Int64 -> Int64 -> Ptr Int64 -> IO ()) -- ^ scale by constant + -> (Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int64 -> IO ()) -- ^ reduction kernel + -> (SNat n -> RS.Array (n + 1) i -> RS.Array n i) +intWidBranchRed1 fsc32 fred32 fsc64 fred64 sn | finiteBitSize (undefined :: i) == 32 = vectorRedInnerOp @i @Int32 sn fromIntegral castPtr fsc32 fred32 | finiteBitSize (undefined :: i) == 64 = vectorRedInnerOp @i @Int64 sn fromIntegral castPtr fsc64 fred64 | otherwise = error "Unsupported Int width" +intWidBranchRedFull :: forall i n. (FiniteBits i, Storable i, Integral i) + => (i -> Int -> i) -- ^ scale op + -- int32 + -> (Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int32 -> IO Int32) -- ^ reduction kernel + -- int64 + -> (Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int64 -> IO Int64) -- ^ reduction kernel + -> (SNat n -> RS.Array n i -> i) +intWidBranchRedFull fsc fred32 fred64 sn + | finiteBitSize (undefined :: i) == 32 = vectorRedFullOp @i @Int32 sn fsc fromIntegral castPtr fred32 + | finiteBitSize (undefined :: i) == 64 = vectorRedFullOp @i @Int64 sn fsc fromIntegral castPtr fred64 + | otherwise = error "Unsupported Int width" + intWidBranchExtr :: forall i n. (FiniteBits i, Storable i, Integral i) => -- int32 (Ptr Int64 -> Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr Int32 -> IO ()) -- ^ extremum kernel @@ -487,6 +557,8 @@ class NumElt a where numEltSignum :: SNat n -> RS.Array n a -> RS.Array n a numEltSum1Inner :: SNat n -> RS.Array (n + 1) a -> RS.Array n a numEltProduct1Inner :: SNat n -> RS.Array (n + 1) a -> RS.Array n a + numEltSumFull :: SNat n -> RS.Array n a -> a + numEltProductFull :: SNat n -> RS.Array n a -> a numEltMinIndex :: RS.Array n a -> [Int] numEltMaxIndex :: RS.Array n a -> [Int] numEltDotprod :: RS.Array 1 a -> RS.Array 1 a -> a @@ -500,6 +572,8 @@ instance NumElt Int32 where numEltSignum = signumVectorInt32 numEltSum1Inner = sum1VectorInt32 numEltProduct1Inner = product1VectorInt32 + numEltSumFull = sumFullVectorInt32 + numEltProductFull = productFullVectorInt32 numEltMinIndex = minindexVectorInt32 numEltMaxIndex = maxindexVectorInt32 numEltDotprod = dotprodVectorInt32 @@ -513,6 +587,8 @@ instance NumElt Int64 where numEltSignum = signumVectorInt64 numEltSum1Inner = sum1VectorInt64 numEltProduct1Inner = product1VectorInt64 + numEltSumFull = sumFullVectorInt64 + numEltProductFull = productFullVectorInt64 numEltMinIndex = minindexVectorInt64 numEltMaxIndex = maxindexVectorInt64 numEltDotprod = dotprodVectorInt64 @@ -526,6 +602,8 @@ instance NumElt Float where numEltSignum = signumVectorFloat numEltSum1Inner = sum1VectorFloat numEltProduct1Inner = product1VectorFloat + numEltSumFull = sumFullVectorFloat + numEltProductFull = productFullVectorFloat numEltMinIndex = minindexVectorFloat numEltMaxIndex = maxindexVectorFloat numEltDotprod = dotprodVectorFloat @@ -539,6 +617,8 @@ instance NumElt Double where numEltSignum = signumVectorDouble numEltSum1Inner = sum1VectorDouble numEltProduct1Inner = product1VectorDouble + numEltSumFull = sumFullVectorDouble + numEltProductFull = productFullVectorDouble numEltMinIndex = minindexVectorDouble numEltMaxIndex = maxindexVectorDouble numEltDotprod = dotprodVectorDouble @@ -556,16 +636,18 @@ instance NumElt Int where numEltNeg = intWidBranch1 @Int (c_unary_i32 (auoEnum UO_NEG)) (c_unary_i64 (auoEnum UO_NEG)) numEltAbs = intWidBranch1 @Int (c_unary_i32 (auoEnum UO_ABS)) (c_unary_i64 (auoEnum UO_ABS)) numEltSignum = intWidBranch1 @Int (c_unary_i32 (auoEnum UO_SIGNUM)) (c_unary_i64 (auoEnum UO_SIGNUM)) - numEltSum1Inner = intWidBranchRed @Int - (c_binary_i32_sv (aboEnum BO_MUL)) (c_reduce_i32 (aroEnum RO_SUM1)) - (c_binary_i64_sv (aboEnum BO_MUL)) (c_reduce_i64 (aroEnum RO_SUM1)) - numEltProduct1Inner = intWidBranchRed @Int - (c_binary_i32_sv (aboEnum BO_MUL)) (c_reduce_i32 (aroEnum RO_PRODUCT1)) - (c_binary_i64_sv (aboEnum BO_MUL)) (c_reduce_i64 (aroEnum RO_PRODUCT1)) + numEltSum1Inner = intWidBranchRed1 @Int + (c_binary_i32_sv (aboEnum BO_MUL)) (c_reduce1_i32 (aroEnum RO_SUM)) + (c_binary_i64_sv (aboEnum BO_MUL)) (c_reduce1_i64 (aroEnum RO_SUM)) + numEltProduct1Inner = intWidBranchRed1 @Int + (c_binary_i32_sv (aboEnum BO_MUL)) (c_reduce1_i32 (aroEnum RO_PRODUCT)) + (c_binary_i64_sv (aboEnum BO_MUL)) (c_reduce1_i64 (aroEnum RO_PRODUCT)) + numEltSumFull = intWidBranchRedFull @Int (*) (c_reducefull_i32 (aroEnum RO_SUM)) (c_reducefull_i64 (aroEnum RO_SUM)) + numEltProductFull = intWidBranchRedFull @Int (^) (c_reducefull_i32 (aroEnum RO_PRODUCT)) (c_reducefull_i64 (aroEnum RO_PRODUCT)) numEltMinIndex = intWidBranchExtr @Int c_extremum_min_i32 c_extremum_min_i64 numEltMaxIndex = intWidBranchExtr @Int c_extremum_max_i32 c_extremum_max_i64 - numEltDotprod = intWidBranchDotprod @Int (c_reduce_i32 (aroEnum RO_SUM1)) c_dotprod_i32 c_dotprod_i32_strided - (c_reduce_i64 (aroEnum RO_SUM1)) c_dotprod_i64 c_dotprod_i64_strided + numEltDotprod = intWidBranchDotprod @Int (c_reduce1_i32 (aroEnum RO_SUM)) c_dotprod_i32 c_dotprod_i32_strided + (c_reduce1_i64 (aroEnum RO_SUM)) c_dotprod_i64 c_dotprod_i64_strided instance NumElt CInt where numEltAdd = intWidBranch2 @CInt (+) @@ -580,16 +662,18 @@ instance NumElt CInt where numEltNeg = intWidBranch1 @CInt (c_unary_i32 (auoEnum UO_NEG)) (c_unary_i64 (auoEnum UO_NEG)) numEltAbs = intWidBranch1 @CInt (c_unary_i32 (auoEnum UO_ABS)) (c_unary_i64 (auoEnum UO_ABS)) numEltSignum = intWidBranch1 @CInt (c_unary_i32 (auoEnum UO_SIGNUM)) (c_unary_i64 (auoEnum UO_SIGNUM)) - numEltSum1Inner = intWidBranchRed @CInt - (c_binary_i32_sv (aboEnum BO_MUL)) (c_reduce_i32 (aroEnum RO_SUM1)) - (c_binary_i64_sv (aboEnum BO_MUL)) (c_reduce_i64 (aroEnum RO_SUM1)) - numEltProduct1Inner = intWidBranchRed @CInt - (c_binary_i32_sv (aboEnum BO_MUL)) (c_reduce_i32 (aroEnum RO_PRODUCT1)) - (c_binary_i64_sv (aboEnum BO_MUL)) (c_reduce_i64 (aroEnum RO_PRODUCT1)) + numEltSum1Inner = intWidBranchRed1 @CInt + (c_binary_i32_sv (aboEnum BO_MUL)) (c_reduce1_i32 (aroEnum RO_SUM)) + (c_binary_i64_sv (aboEnum BO_MUL)) (c_reduce1_i64 (aroEnum RO_SUM)) + numEltProduct1Inner = intWidBranchRed1 @CInt + (c_binary_i32_sv (aboEnum BO_MUL)) (c_reduce1_i32 (aroEnum RO_PRODUCT)) + (c_binary_i64_sv (aboEnum BO_MUL)) (c_reduce1_i64 (aroEnum RO_PRODUCT)) + numEltSumFull = intWidBranchRedFull @CInt mulWithInt (c_reducefull_i32 (aroEnum RO_SUM)) (c_reducefull_i64 (aroEnum RO_SUM)) + numEltProductFull = intWidBranchRedFull @CInt (^) (c_reducefull_i32 (aroEnum RO_PRODUCT)) (c_reducefull_i64 (aroEnum RO_PRODUCT)) numEltMinIndex = intWidBranchExtr @CInt c_extremum_min_i32 c_extremum_min_i64 numEltMaxIndex = intWidBranchExtr @CInt c_extremum_max_i32 c_extremum_max_i64 - numEltDotprod = intWidBranchDotprod @CInt (c_reduce_i32 (aroEnum RO_SUM1)) c_dotprod_i32 c_dotprod_i32_strided - (c_reduce_i64 (aroEnum RO_SUM1)) c_dotprod_i64 c_dotprod_i64_strided + numEltDotprod = intWidBranchDotprod @CInt (c_reduce1_i32 (aroEnum RO_SUM)) c_dotprod_i32 c_dotprod_i32_strided + (c_reduce1_i64 (aroEnum RO_SUM)) c_dotprod_i64 c_dotprod_i64_strided class FloatElt a where floatEltDiv :: SNat n -> RS.Array n a -> RS.Array n a -> RS.Array n a diff --git a/src/Data/Array/Mixed/Internal/Arith/Foreign.hs b/src/Data/Array/Mixed/Internal/Arith/Foreign.hs index a406dab..ca96093 100644 --- a/src/Data/Array/Mixed/Internal/Arith/Foreign.hs +++ b/src/Data/Array/Mixed/Internal/Arith/Foreign.hs @@ -49,9 +49,13 @@ $(fmap concat . forM floatTypesList $ \arithtype -> do $(fmap concat . forM typesList $ \arithtype -> do let ttyp = conT (atType arithtype) - let base = "reduce_" ++ atCName arithtype - pure . ForeignD . ImportF CCall Unsafe ("oxarop_" ++ base) (mkName ("c_" ++ base)) <$> - [t| CInt -> Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr $ttyp -> Ptr $ttyp -> IO () |]) + let base1 = "reduce1_" ++ atCName arithtype + basefull = "reducefull_" ++ atCName arithtype + sequence + [ForeignD . ImportF CCall Unsafe ("oxarop_" ++ base1) (mkName ("c_" ++ base1)) <$> + [t| CInt -> Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr $ttyp -> Ptr $ttyp -> IO () |] + ,ForeignD . ImportF CCall Unsafe ("oxarop_" ++ basefull) (mkName ("c_" ++ basefull)) <$> + [t| CInt -> Int64 -> Ptr Int64 -> Ptr Int64 -> Ptr $ttyp -> IO $ttyp |]]) $(fmap concat . forM typesList $ \arithtype -> fmap concat . forM ["min", "max"] $ \fname -> do diff --git a/src/Data/Array/Mixed/XArray.hs b/src/Data/Array/Mixed/XArray.hs index 08295cd..fa753bb 100644 --- a/src/Data/Array/Mixed/XArray.hs +++ b/src/Data/Array/Mixed/XArray.hs @@ -240,8 +240,8 @@ transpose2 ssh1 ssh2 (XArray arr) , let n1 = ssxLength ssh1 = XArray (S.transpose (ssxIotaFrom n1 ssh2 ++ ssxIotaFrom 0 ssh1) arr) -sumFull :: (Storable a, NumElt a) => XArray sh a -> a -sumFull (XArray arr) = +sumFull :: (Storable a, NumElt a) => StaticShX sh -> XArray sh a -> a +sumFull _ (XArray arr) = S.unScalar $ numEltSum1Inner (SNat @0) $ S.fromVector [product (S.shapeL arr)] $ diff --git a/src/Data/Array/Nested.hs b/src/Data/Array/Nested.hs index 3a60305..53417bd 100644 --- a/src/Data/Array/Nested.hs +++ b/src/Data/Array/Nested.hs @@ -6,7 +6,7 @@ module Data.Array.Nested ( ListR(ZR, (:::)), IxR(.., ZIR, (:.:)), IIxR, ShR(.., ZSR, (:$:)), IShR, - rshape, rrank, rsize, rindex, rindexPartial, rgenerate, rsumOuter1, + rshape, rrank, rsize, rindex, rindexPartial, rgenerate, rsumOuter1, rsumAllPrim, rtranspose, rappend, rconcat, rscalar, rfromVector, rtoVector, runScalar, rrerank, rreplicate, rreplicateScal, rfromListOuter, rfromList1, rfromList1Prim, rtoListOuter, rtoList1, @@ -26,7 +26,7 @@ module Data.Array.Nested ( ListS(ZS, (::$)), IxS(.., ZIS, (:.$)), IIxS, ShS(.., ZSS, (:$$)), KnownShS(..), - sshape, srank, ssize, sindex, sindexPartial, sgenerate, ssumOuter1, + sshape, srank, ssize, sindex, sindexPartial, sgenerate, ssumOuter1, ssumAllPrim, stranspose, sappend, sscalar, sfromVector, stoVector, sunScalar, -- TODO: sconcat? What should its type be? srerank, @@ -48,7 +48,7 @@ module Data.Array.Nested ( ShX(.., ZSX, (:$%)), KnownShX(..), StaticShX(.., ZKX, (:!%)), SMayNat(..), - mshape, mrank, msize, mindex, mindexPartial, mgenerate, msumOuter1, + mshape, mrank, msize, mindex, mindexPartial, mgenerate, msumOuter1, msumAllPrim, mtranspose, mappend, mconcat, mscalar, mfromVector, mtoVector, munScalar, mrerank, mreplicate, mreplicateScal, mfromListOuter, mfromList1, mfromList1Prim, mtoListOuter, mtoList1, diff --git a/src/Data/Array/Nested/Internal/Mixed.hs b/src/Data/Array/Nested/Internal/Mixed.hs index 594383c..215313e 100644 --- a/src/Data/Array/Nested/Internal/Mixed.hs +++ b/src/Data/Array/Nested/Internal/Mixed.hs @@ -713,6 +713,9 @@ msumOuter1 :: forall sh n a. (NumElt a, PrimElt a) => Mixed (n : sh) a -> Mixed sh a msumOuter1 = fromPrimitive . msumOuter1P @sh @n @a . toPrimitive +msumAllPrim :: (PrimElt a, NumElt a) => Mixed sh a -> a +msumAllPrim (toPrimitive -> M_Primitive sh arr) = X.sumFull (ssxFromShape sh) arr + mappend :: forall n m sh a. Elt a => Mixed (n : sh) a -> Mixed (m : sh) a -> Mixed (AddMaybe n m : sh) a mappend arr1 arr2 = mlift2 (snm :!% ssh) f arr1 arr2 diff --git a/src/Data/Array/Nested/Internal/Ranked.hs b/src/Data/Array/Nested/Internal/Ranked.hs index bd37e7a..74b2186 100644 --- a/src/Data/Array/Nested/Internal/Ranked.hs +++ b/src/Data/Array/Nested/Internal/Ranked.hs @@ -282,6 +282,9 @@ rsumOuter1 :: forall n a. (NumElt a, PrimElt a) => Ranked (n + 1) a -> Ranked n a rsumOuter1 = rfromPrimitive . rsumOuter1P . rtoPrimitive +rsumAllPrim :: (PrimElt a, NumElt a) => Ranked n a -> a +rsumAllPrim (Ranked arr) = msumAllPrim arr + rtranspose :: forall n a. Elt a => PermR -> Ranked n a -> Ranked n a rtranspose perm arr | sn@SNat <- rrank arr diff --git a/src/Data/Array/Nested/Internal/Shaped.hs b/src/Data/Array/Nested/Internal/Shaped.hs index f50ed28..ea979fa 100644 --- a/src/Data/Array/Nested/Internal/Shaped.hs +++ b/src/Data/Array/Nested/Internal/Shaped.hs @@ -277,6 +277,9 @@ ssumOuter1 :: forall sh n a. (NumElt a, PrimElt a) => Shaped (n : sh) a -> Shaped sh a ssumOuter1 = sfromPrimitive . ssumOuter1P . stoPrimitive +ssumAllPrim :: (PrimElt a, NumElt a) => Shaped n a -> a +ssumAllPrim (Shaped arr) = msumAllPrim arr + stranspose :: forall is sh a. (IsPermutation is, Rank is <= Rank sh, Elt a) => Perm is -> Shaped sh a -> Shaped (PermutePrefix is sh) a stranspose perm sarr@(Shaped arr) -- cgit v1.2.3-70-g09d2