1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <math.h>
typedef int32_t i32;
typedef int64_t i64;
#define COMM_OP(name, op, typ) \
void oxarop_ ## name ## _ ## typ ## _sv(i64 n, typ *out, typ x, const typ *y) { \
for (i64 i = 0; i < n; i++) out[i] = x op y[i]; \
} \
void oxarop_ ## name ## _ ## typ ## _vv(i64 n, typ *out, const typ *x, const typ *y) { \
for (i64 i = 0; i < n; i++) out[i] = x[i] op y[i]; \
}
#define NONCOMM_OP(name, op, typ) \
COMM_OP(name, op, typ) \
void oxarop_ ## name ## _ ## typ ## _vs(i64 n, typ *out, const typ *x, typ y) { \
for (i64 i = 0; i < n; i++) out[i] = x[i] op y; \
}
#define UNARY_OP(name, op, typ) \
void oxarop_ ## name ## _ ## typ(i64 n, typ *out, const typ *x) { \
for (i64 i = 0; i < n; i++) out[i] = op(x[i]); \
}
#define GEN_ABS(x) \
_Generic((x), \
int: abs, \
long: labs, \
long long: llabs, \
float: fabsf, \
double: fabs)(x)
// This does not result in multiple loads with GCC 13.
#define GEN_SIGNUM(x) ((x) < 0 ? -1 : (x) > 0 ? 1 : 0)
// Walk a orthotope-style strided array, except for the inner dimension. The
// body is run for every "inner vector".
#define TARRAY_WALK_NOINNER(again_label_name, rank, shape, strides, body) \
do { \
i64 idx[(rank) - 1]; \
memset(idx, 0, ((rank) - 1) * sizeof(idx[0])); \
i64 arrlinidx = 0; \
i64 outlinidx = 0; \
again_label_name: \
{ \
body \
} \
for (i64 dim = (rank) - 2; dim >= 0; dim--) { \
if (++idx[dim] < (shape)[dim]) { \
arrlinidx += (strides)[dim]; \
outlinidx++; \
goto again_label_name; \
} \
arrlinidx -= (idx[dim] - 1) * (strides)[dim]; \
idx[dim] = 0; \
} \
} while (false)
// preconditions:
// - all strides are >0
// - shape is everywhere >0
// - rank is >= 1
// - out has capacity for (shape[0] * ... * shape[rank - 2]) elements
// Reduces along the innermost dimension.
// 'out' will be filled densely in linearisation order.
#define REDUCE1_OP(name, op, typ) \
void oxarop_ ## name ## _ ## typ(i64 rank, const i64 *shape, const i64 *strides, typ *out, const typ *arr) { \
if (strides[rank - 1] == 1) { \
TARRAY_WALK_NOINNER(again1, rank, shape, strides, { \
typ accum = arr[arrlinidx]; \
for (i64 i = 1; i < shape[rank - 1]; i++) { \
accum = accum op arr[arrlinidx + i]; \
} \
out[outlinidx] = accum; \
}); \
} else { \
TARRAY_WALK_NOINNER(again2, rank, shape, strides, { \
typ accum = arr[arrlinidx]; \
for (i64 i = 1; i < shape[rank - 1]; i++) { \
accum = accum op arr[arrlinidx + strides[rank - 1] * i]; \
} \
out[outlinidx] = accum; \
}); \
} \
}
#define NUM_TYPES_LOOP_XLIST \
X(i32) X(i64) X(double) X(float)
#define X(typ) \
COMM_OP(add, +, typ) \
NONCOMM_OP(sub, -, typ) \
COMM_OP(mul, *, typ) \
UNARY_OP(neg, -, typ) \
UNARY_OP(abs, GEN_ABS, typ) \
UNARY_OP(signum, GEN_SIGNUM, typ) \
REDUCE1_OP(sum1, +, typ) \
REDUCE1_OP(product1, *, typ)
NUM_TYPES_LOOP_XLIST
#undef X
|