From be2bc936957d4fcbc2001ca48909bb2ce8d3f5c7 Mon Sep 17 00:00:00 2001 From: tomsmeding Date: Fri, 19 Apr 2019 23:32:06 +0200 Subject: Working Futhark! 🎉 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That was actually easier than expected, and (surprise!) it runs ORDERS OF MAGNITUDE faster than my C++ implementation... C++ takes ~1min on my i5-4278U CPU @ 2.60GHz using all 4 virtual cores; the Futhark takes ~11sec on 1 core. Not sure whether my own code is so terribly bad or whether Futhark is cool, but this is cool nevertheless. --- aberth/.gitignore | 2 ++ aberth/Makefile | 23 +++++++++++++----- aberth/aberth.cpp | 62 ++++++++++++++++++++++++++++++++++++++++++++---- aberth/aberth_kernel.fut | 45 +++++++++++++++++++++++++++++++---- 4 files changed, 117 insertions(+), 15 deletions(-) (limited to 'aberth') diff --git a/aberth/.gitignore b/aberth/.gitignore index 66c7e62..db0dbae 100644 --- a/aberth/.gitignore +++ b/aberth/.gitignore @@ -1,2 +1,4 @@ aberth lib/ +aberth_kernel.c +aberth_kernel.h diff --git a/aberth/Makefile b/aberth/Makefile index c5a67cf..f610c01 100644 --- a/aberth/Makefile +++ b/aberth/Makefile @@ -1,20 +1,31 @@ CXX = g++ -CXXFLAGS = -Wall -Wextra -std=c++17 -O3 -fwrapv -ffast-math -march=native -mtune=native -LDFLAGS = -pthread +CXXFLAGS = -Wall -Wextra -std=c++17 -fwrapv $(OPTFLAGS) +OPTFLAGS = -O3 -ffast-math -march=native -mtune=native +LDFLAGS = -pthread -framework OpenCL + +TARGETS = aberth + +OBJECTS = $(patsubst %.cpp,%.o,$(wildcard *.cpp)) .PHONY: all clean remake -all: aberth +all: $(TARGETS) clean: - rm -f aberth *.o + rm -f $(TARGETS) *.o aberth_kernel.h aberth_kernel.c remake: clean $(MAKE) all -aberth: $(patsubst %.cpp,%.o,$(wildcard *.cpp)) ../lodepng.o +aberth: $(OBJECTS) aberth_kernel.o ../lodepng.o $(CXX) -o $@ $^ $(LDFLAGS) -%.o: %.cpp $(wildcard *.h) +aberth_kernel.h: aberth_kernel.fut + futhark c --library $^ + +aberth_kernel.o: aberth_kernel.h + gcc -c $(OPTFLAGS) -o $@ aberth_kernel.c + +$(OBJECTS): %.o: %.cpp $(wildcard *.h) aberth_kernel.h $(CXX) $(CXXFLAGS) -c -o $@ $< diff --git a/aberth/aberth.cpp b/aberth/aberth.cpp index 2adf207..d002ba4 100644 --- a/aberth/aberth.cpp +++ b/aberth/aberth.cpp @@ -16,6 +16,10 @@ #include #include "../lodepng.h" +extern "C" { +#include "aberth_kernel.h" +} + using namespace std; @@ -132,7 +136,7 @@ struct AberthState { } } - // Gauss-Seidel-style step where the updated values are already used in the current iteration + // Lagrange-style step where the new elements are computed in parallel from the previous values bool step() { array pairs; for (int i = 0; i < N - 1; i++) { @@ -143,6 +147,7 @@ struct AberthState { bool allConverged = true; + AApprox newapprox; AApprox offsets; for (int i = 0; i < N; i++) { Com pval = eval(poly, approx[i]); @@ -153,12 +158,15 @@ struct AberthState { for (int j = i + 1; j < N; j++) sum += pairs[N * i + j]; offsets[i] = quo / (1.0 - quo * sum); - approx[i] -= offsets[i]; + // approx[i] -= offsets[i]; + newapprox[i] = approx[i] - offsets[i]; - double sval = eval(boundPoly, abs(approx[i])); + double sval = eval(boundPoly, abs(newapprox[i])); if (abs(pval) > 1e-9 * sval) allConverged = false; } + approx = newapprox; + return allConverged; } @@ -332,6 +340,48 @@ static vector drawImage(int W, int H, const vector &counts, int ma return image; } +static vector invoke_kernel( + int32_t width, int32_t height, + double left, double top, double right, double bottom, + int32_t seed) { + + futhark_context_config *config = futhark_context_config_new(); + // futhark_context_config_select_device_interactively(config); + + futhark_context *ctx = futhark_context_new(config); + + auto check_ret = [ctx](int ret) { + if (ret != 0) { + char *str = futhark_context_get_error(ctx); + cerr << str << endl; + free(str); + exit(1); + } + }; + + futhark_i32_1d *dest_arr; + + check_ret(futhark_entry_main_all( + ctx, &dest_arr, + width, height, + left, top, right, bottom, + seed)); + + check_ret(futhark_context_sync(ctx)); + + // Shouldn't free _this_ pointer, apparently + int64_t *shape = futhark_shape_i32_1d(ctx, dest_arr); + assert(shape[0] == width * height); + + vector buffer(width * height); + check_ret(futhark_values_i32_1d(ctx, dest_arr, buffer.data())); + check_ret(futhark_free_i32_1d(ctx, dest_arr)); + + futhark_context_config_free(config); + + return buffer; +} + int main(int argc, char **argv) { srandomdev(); @@ -339,7 +389,11 @@ int main(int argc, char **argv) { vector counts; if (argc <= 1) { - tie(W, H, counts) = computeCounts(); + // tie(W, H, counts) = computeCounts(); + W = H = 900; + Com bottomLeft = Com(-1.5, -1.5); + Com topRight = Com(1.5, 1.5); + counts = invoke_kernel(W, H, bottomLeft.real(), topRight.imag(), topRight.real(), bottomLeft.imag(), 42); writeCounts(W, H, counts, "out.txt"); } else if (argc == 2) { tie(W, H, counts) = readCounts(argv[1]); diff --git a/aberth/aberth_kernel.fut b/aberth/aberth_kernel.fut index a52d8b6..815c254 100644 --- a/aberth/aberth_kernel.fut +++ b/aberth/aberth_kernel.fut @@ -29,7 +29,7 @@ let evaln_d (p: poly) (nterms: i32) (pt: f64): f64 = let eval_d (p: poly) (pt: f64): f64 = evaln_d p (length p) pt let derivative (p: poly): *poly = - map (\(i, v) -> f64.i32 i * v) (zip (1.. f64.i32 i * v) (zip (1.. c64.mag p > 1e-9 * s) pvals svals + let conditions = map2 (\p s -> c64.mag p <= 1e-9 * s) pvals svals let all_converged = all id conditions in (all_converged, approx) @@ -123,4 +123,39 @@ let derbyshire_at_index (index: i32): *poly = in tabulate PolyN (\i -> f64.i32 (i32.get_bit i bitfield * 2 - 1)) -entry main: i32 = 42 +let point_index + (width: i32) (height: i32) + (top_left: complex) (bottom_right: complex) + (pt: complex) + : i32 = + let x = (c64.re pt - c64.re top_left) / (c64.re bottom_right - c64.re top_left) + let y = (c64.im pt - c64.im top_left) / (c64.im bottom_right - c64.im top_left) + let xi = t64 (x * r64 (width - 1)) + let yi = t64 (y * r64 (height - 1)) + in width * yi + xi + +entry main_job + (start_index: i32) (num_polys: i32) + (width: i32) (height: i32) + (left: f64) (top: f64) (right: f64) (bottom: f64) + (seed: i32) + : []i32 = + -- Unnecessary to give each polynomial a different seed + let rng = rand_engine.rng_from_seed [seed] + let top_left = c64.mk left top + let bottom_right = c64.mk right bottom + let indices = flatten + (map (\idx -> + let p = derbyshire_at_index idx + let (_, pts) = aberth.aberth p rng + let indices = map (point_index width height top_left bottom_right) pts + in indices) + (start_index ..< start_index + num_polys)) + in reduce_by_index (replicate (width * height) 0) (+) 0 indices (replicate (length indices) 1) + +entry main_all + (width: i32) (height: i32) + (left: f64) (top: f64) (right: f64) (bottom: f64) + (seed: i32) + : []i32 = + main_job 0 (1 << N) width height left top right bottom seed -- cgit v1.2.3-70-g09d2