From be2bc936957d4fcbc2001ca48909bb2ce8d3f5c7 Mon Sep 17 00:00:00 2001
From: tomsmeding <tom.smeding@gmail.com>
Date: Fri, 19 Apr 2019 23:32:06 +0200
Subject: Working Futhark! 🎉
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

That was actually easier than expected, and (surprise!) it runs ORDERS
OF MAGNITUDE faster than my C++ implementation...
C++ takes ~1min on my i5-4278U CPU @ 2.60GHz using all 4 virtual cores;
the Futhark takes ~11sec on 1 core. Not sure whether my own code is so
terribly bad or whether Futhark is cool, but this is cool nevertheless.
---
 aberth/.gitignore        |  2 ++
 aberth/Makefile          | 23 +++++++++++++-----
 aberth/aberth.cpp        | 62 ++++++++++++++++++++++++++++++++++++++++++++----
 aberth/aberth_kernel.fut | 45 +++++++++++++++++++++++++++++++----
 4 files changed, 117 insertions(+), 15 deletions(-)

(limited to 'aberth')
diff --git a/aberth/.gitignore b/aberth/.gitignore
index 66c7e62..db0dbae 100644
--- a/aberth/.gitignore
+++ b/aberth/.gitignore
@@ -1,2 +1,4 @@
 aberth
 lib/
+aberth_kernel.c
+aberth_kernel.h
diff --git a/aberth/Makefile b/aberth/Makefile
index c5a67cf..f610c01 100644
--- a/aberth/Makefile
+++ b/aberth/Makefile
@@ -1,20 +1,31 @@
 CXX = g++
-CXXFLAGS = -Wall -Wextra -std=c++17 -O3 -fwrapv -ffast-math -march=native -mtune=native
-LDFLAGS = -pthread
+CXXFLAGS = -Wall -Wextra -std=c++17 -fwrapv $(OPTFLAGS)
+OPTFLAGS = -O3 -ffast-math -march=native -mtune=native
+LDFLAGS = -pthread -framework OpenCL
+
+TARGETS = aberth
+
+OBJECTS = $(patsubst %.cpp,%.o,$(wildcard *.cpp))
 
 .PHONY: all clean remake
 
-all: aberth
+all: $(TARGETS)
 
 clean:
-	rm -f aberth *.o
+	rm -f $(TARGETS) *.o aberth_kernel.h aberth_kernel.c
 
 remake: clean
 	$(MAKE) all
 
 
-aberth: $(patsubst %.cpp,%.o,$(wildcard *.cpp)) ../lodepng.o
+aberth: $(OBJECTS) aberth_kernel.o ../lodepng.o
 	$(CXX) -o $@ $^ $(LDFLAGS)
 
-%.o: %.cpp $(wildcard *.h)
+aberth_kernel.h: aberth_kernel.fut
+	futhark c --library $^
+
+aberth_kernel.o: aberth_kernel.h
+	gcc -c $(OPTFLAGS) -o $@ aberth_kernel.c
+
+$(OBJECTS): %.o: %.cpp $(wildcard *.h) aberth_kernel.h
 	$(CXX) $(CXXFLAGS) -c -o $@ $<
diff --git a/aberth/aberth.cpp b/aberth/aberth.cpp
index 2adf207..d002ba4 100644
--- a/aberth/aberth.cpp
+++ b/aberth/aberth.cpp
@@ -16,6 +16,10 @@
 #include <cassert>
 #include "../lodepng.h"
 
+extern "C" {
+#include "aberth_kernel.h"
+}
+
 using namespace std;
 
 
@@ -132,7 +136,7 @@ struct AberthState {
 		}
 	}
 
-	// Gauss-Seidel-style step where the updated values are already used in the current iteration
+    // Lagrange-style step where the new elements are computed in parallel from the previous values
 	bool step() {
 		array<Com, N * N> pairs;
 		for (int i = 0; i < N - 1; i++) {
@@ -143,6 +147,7 @@ struct AberthState {
 
 		bool allConverged = true;
 
+		AApprox newapprox;
 		AApprox offsets;
 		for (int i = 0; i < N; i++) {
 			Com pval = eval(poly, approx[i]);
@@ -153,12 +158,15 @@ struct AberthState {
 			for (int j = i + 1; j < N; j++) sum += pairs[N * i + j];
 			offsets[i] = quo / (1.0 - quo * sum);
 
-			approx[i] -= offsets[i];
+			// approx[i] -= offsets[i];
+			newapprox[i] = approx[i] - offsets[i];
 
-			double sval = eval(boundPoly, abs(approx[i]));
+			double sval = eval(boundPoly, abs(newapprox[i]));
 			if (abs(pval) > 1e-9 * sval) allConverged = false;
 		}
 
+		approx = newapprox;
+
 		return allConverged;
 	}
 
@@ -332,6 +340,48 @@ static vector<uint8_t> drawImage(int W, int H, const vector<int> &counts, int ma
 	return image;
 }
 
+static vector<int32_t> invoke_kernel(
+		int32_t width, int32_t height,
+		double left, double top, double right, double bottom,
+		int32_t seed) {
+
+	futhark_context_config *config = futhark_context_config_new();
+	// futhark_context_config_select_device_interactively(config);
+
+	futhark_context *ctx = futhark_context_new(config);
+
+	auto check_ret = [ctx](int ret) {
+		if (ret != 0) {
+			char *str = futhark_context_get_error(ctx);
+			cerr << str << endl;
+			free(str);
+			exit(1);
+		}
+	};
+
+	futhark_i32_1d *dest_arr;
+
+	check_ret(futhark_entry_main_all(
+			ctx, &dest_arr,
+			width, height,
+			left, top, right, bottom,
+			seed));
+
+	check_ret(futhark_context_sync(ctx));
+
+	// Shouldn't free _this_ pointer, apparently
+	int64_t *shape = futhark_shape_i32_1d(ctx, dest_arr);
+	assert(shape[0] == width * height);
+
+	vector<int32_t> buffer(width * height);
+	check_ret(futhark_values_i32_1d(ctx, dest_arr, buffer.data()));
+	check_ret(futhark_free_i32_1d(ctx, dest_arr));
+
+	futhark_context_config_free(config);
+
+	return buffer;
+}
+
 int main(int argc, char **argv) {
 	srandomdev();
 
@@ -339,7 +389,11 @@ int main(int argc, char **argv) {
 	vector<int> counts;
 
 	if (argc <= 1) {
-		tie(W, H, counts) = computeCounts();
+		// tie(W, H, counts) = computeCounts();
+		W = H = 900;
+		Com bottomLeft = Com(-1.5, -1.5);
+		Com topRight = Com(1.5, 1.5);
+		counts = invoke_kernel(W, H, bottomLeft.real(), topRight.imag(), topRight.real(), bottomLeft.imag(), 42);
 		writeCounts(W, H, counts, "out.txt");
 	} else if (argc == 2) {
 		tie(W, H, counts) = readCounts(argv[1]);
diff --git a/aberth/aberth_kernel.fut b/aberth/aberth_kernel.fut
index a52d8b6..815c254 100644
--- a/aberth/aberth_kernel.fut
+++ b/aberth/aberth_kernel.fut
@@ -29,7 +29,7 @@ let evaln_d (p: poly) (nterms: i32) (pt: f64): f64 =
 let eval_d (p: poly) (pt: f64): f64 = evaln_d p (length p) pt
 
 let derivative (p: poly): *poly = 
-    map (\(i, v) -> f64.i32 i * v) (zip (1..<PolyN) (take (PolyN - 1) p)) ++ [0]
+    map (\(i, v) -> f64.i32 i * v) (zip (1..<PolyN) (drop 1 p)) ++ [0]
 
 -- Cauchy's bound: https://en.wikipedia.org/wiki/Geometrical_properties_of_polynomial_roots#Lagrange's_and_Cauchy's_bounds
 let max_root_norm (p: poly): f64 =
@@ -37,8 +37,8 @@ let max_root_norm (p: poly): f64 =
 
 module aberth = {
     type approx = [N]complex
-	-- bound is 's' in the stop condition formulated at p.189-190 of
-	-- https://link.springer.com/article/10.1007%2FBF02207694
+    -- bound is 's' in the stop condition formulated at p.189-190 of
+    -- https://link.springer.com/article/10.1007%2FBF02207694
     type context = {p: poly, deriv: poly, bound: poly, radius: f64}
 
     let gen_coord (r: f64) (rng: *rand_engine.rng): *(rand_engine.rng, f64) =
@@ -82,7 +82,7 @@ module aberth = {
                            quos sums
         let approx = map2 (c64.-) approx offsets
         let svals = map (eval_d ctx.bound <-< c64.mag) approx
-        let conditions = map2 (\p s -> c64.mag p > 1e-9 * s) pvals svals
+        let conditions = map2 (\p s -> c64.mag p <= 1e-9 * s) pvals svals
         let all_converged = all id conditions
         in (all_converged, approx)
 
@@ -123,4 +123,39 @@ let derbyshire_at_index (index: i32): *poly =
     in tabulate PolyN (\i -> f64.i32 (i32.get_bit i bitfield * 2 - 1))
 
 
-entry main: i32 = 42
+let point_index
+        (width: i32) (height: i32)
+        (top_left: complex) (bottom_right: complex)
+        (pt: complex)
+        : i32 =
+    let x = (c64.re pt - c64.re top_left) / (c64.re bottom_right - c64.re top_left)
+    let y = (c64.im pt - c64.im top_left) / (c64.im bottom_right - c64.im top_left)
+    let xi = t64 (x * r64 (width - 1))
+    let yi = t64 (y * r64 (height - 1))
+    in width * yi + xi
+
+entry main_job
+        (start_index: i32) (num_polys: i32)
+        (width: i32) (height: i32)
+        (left: f64) (top: f64) (right: f64) (bottom: f64)
+        (seed: i32)
+        : []i32 =
+    -- Unnecessary to give each polynomial a different seed
+    let rng = rand_engine.rng_from_seed [seed]
+    let top_left = c64.mk left top
+    let bottom_right = c64.mk right bottom
+    let indices = flatten
+            (map (\idx ->
+                    let p = derbyshire_at_index idx
+                    let (_, pts) = aberth.aberth p rng
+                    let indices = map (point_index width height top_left bottom_right) pts
+                    in indices)
+                 (start_index ..< start_index + num_polys))
+    in reduce_by_index (replicate (width * height) 0) (+) 0 indices (replicate (length indices) 1)
+
+entry main_all
+        (width: i32) (height: i32)
+        (left: f64) (top: f64) (right: f64) (bottom: f64)
+        (seed: i32)
+        : []i32 =
+    main_job 0 (1 << N) width height left top right bottom seed
-- 
cgit v1.3.1