1 files changed, 87 insertions, 15 deletions
diff --git a/mandel.cu b/mandel.cu
index 480daed..386f962 100644
--- a/mandel.cu
+++ b/mandel.cu
@@ -60,34 +60,77 @@ void mandel_free(Mandel *ctx) {
 	delete ctx;
 }
 
+// Real and imaginary part of the next z, with z = a + b*i, c = x + y*i, a2 = a*a, b2 = b*b. Exactly one variant should be enabled. These macros may assume that the parameters are simple variable names (arguments are not parenthesized).
+#if 1  // z -> z^2 + c
+#define MANDEL_FUNCTION_REAL(a, b, a2, b2, x, y) (a2 - b2 + x)
+#define MANDEL_FUNCTION_IMAG(a, b, a2, b2, x, y) (2 * a * b + y)
+#endif
+#if 0  // z -> z^2 + c + 1
+#define MANDEL_FUNCTION_REAL(a, b, a2, b2, x, y) (a2 - b2 + x + 1)
+#define MANDEL_FUNCTION_IMAG(a, b, a2, b2, x, y) (2 * a * b + y)
+#endif
+#if 0  // z -> z^3 + c
+#define MANDEL_FUNCTION_REAL(a, b, a2, b2, x, y) (a2 * a - 3 * a * b * b + x)
+#define MANDEL_FUNCTION_IMAG(a, b, a2, b2, x, y) (3 * a * a * b - b2 * b + y)
+#endif
+
 #define MANDEL_GENERIC(dst, ctx, par, ix, iy, idx) { \
 		const double x = (par).cx - (par).imgw / 2 + (par).imgw * (ix) / ((ctx).w-1); \
 		const double y = (par).cy - (ctx).imgh / 2 + (ctx).imgh * (iy) / ((ctx).h-1); \
 		double a = x, b = y, a2 = a * a, b2 = b * b; \
 		int16_t iter, maxiter = (par).maxit; \
 		for (iter = 0; iter < maxiter && a2 + b2 < 4; iter++) { \
-			b = 2 * a * b + y; a = a2 - b2 + x; \
+			double newa = MANDEL_FUNCTION_REAL(a, b, a2, b2, x, y); /* next z from the enabled MANDEL_FUNCTION_* variant; both parts read the old a, b */ \
+			double newb = MANDEL_FUNCTION_IMAG(a, b, a2, b2, x, y); \
+			a = newa; b = newb; /* commit only after both parts have been evaluated */ \
 			a2 = a * a; b2 = b * b; \
 		} \
 		(dst)[idx] = iter; \
 	}
 
-__global__ void mandel_gpu(int16_t *dst, const Mandel *ctx, const Params *par) {
-	const int idx = blockDim.x * blockIdx.x + threadIdx.x;
-	const int ix = idx % (int)ctx->w, iy = ctx->h - 1 - idx / (int)ctx->w;
+__global__ void mandel_gpu(int16_t *dst, const Mandel *ctx, const Params *par, int subsample) {  // one thread per sampled pixel (every subsample-th column and row)
+	const int subidx = blockDim.x * blockIdx.x + threadIdx.x;  // flat 1D thread index = index of the sampled cell
+	const int subw = (ctx->w + subsample - 1) / subsample;  // sampled columns per row: ceil(w / subsample)
+	const int subix = subidx % subw, subiy = subidx / subw;
+	const int ix = subix * subsample, iy = subiy * subsample;  // top-left pixel of this thread's cell
+	const int idx = ctx->w * iy + ix;  // row-major offset of that pixel in dst
+	const int riy = ctx->h - 1 - iy;  // row index mirrored vertically; this is what MANDEL_GENERIC receives as iy
+	// Surplus threads that map below the last image row do nothing.
 	if (iy >= ctx->h) return;
 
-	MANDEL_GENERIC(dst, *ctx, *par, ix, iy, idx);
+	MANDEL_GENERIC(dst, *ctx, *par, ix, riy, idx);
+}
+
+__global__ void unsubsample_gpu(int16_t *dst, const Mandel *ctx, int subsample) {  // one thread per pixel: copy the top-left sample of the pixel's subsample x subsample cell
+	const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+	const int ix = idx % ctx->w, iy = idx / ctx->w;
+	const int cellix = ix / subsample * subsample, celliy = iy / subsample * subsample;
+	if (iy >= ctx->h || (ix == cellix && iy == celliy)) return;  // surplus thread past the image, or the sample itself (never rewritten, so reads of it cannot race with a write)
+	dst[ctx->w * iy + ix] = dst[ctx->w * celliy + cellix];
 }
 
 // Unused in GPU mode
 __attribute__((unused))
-static inline void mandel_cpu(int16_t *dst, const Mandel *ctx, const Params *par) {
-	int idx = 0;
-	for (int iy = 0; iy < ctx->h; iy++) {
-		for (int ix = 0; ix < ctx->w; ix++) {
-			MANDEL_GENERIC(dst, *ctx, *par, ix, iy, idx);
-			idx++;
+static inline void mandel_cpu(int16_t *dst, const Mandel *ctx, const Params *par, int subsample) {  // CPU renderer: iterates every subsample-th pixel, then replicates it over its cell; subsample must be >= 1
+	for (int iy = 0; iy < ctx->h; iy += subsample) {  // pass 1: compute only the top-left pixel of each cell
+		for (int ix = 0; ix < ctx->w; ix += subsample) {
+			int idx = ctx->w * iy + ix;
+			int riy = ctx->h - 1 - iy;  // row index mirrored vertically
+			MANDEL_GENERIC(dst, *ctx, *par, ix, riy, idx);
+		}
+	}
+	// Pass 2 (only when subsampling): copy each sample over its subsample x subsample cell.
+	if (subsample == 1) return;
+
+	for (int iy = 0; iy < ctx->h; iy += subsample) {
+		for (int ix = 0; ix < ctx->w; ix += subsample) {
+			const int16_t value = dst[ctx->w * iy + ix];
+			// Cells cut off by the right/bottom edge are clipped so the fill never leaves its row or the buffer.
+			for (int subiy = 0; subiy < subsample && iy + subiy < ctx->h; subiy++) {
+				for (int subix = 0; subix < subsample && ix + subix < ctx->w; subix++) {
+					dst[ctx->w * (iy + subiy) + ix + subix] = value;
+				}
+			}
 		}
 	}
 }
@@ -126,20 +169,28 @@ double mandel_imgh(const Mandel *ctx, const Params *par) {
 	return par->imgw * ctx->h / ctx->w;
 }
 
-void mandel_render(uint8_t *dst, Mandel *ctx, const Params *par) {
+void mandel_render(uint8_t *dst, Mandel *ctx, const Params *par, size_t subsample) {
 	ctx->imgh = mandel_imgh(ctx, par);
 
 	int64_t t1 = gettimestamp();
 
 #ifdef USE_GPU
+	assert(subsample >= 1 && (size_t)(int)subsample == subsample);  // non-zero and representable as the kernels' int parameter
+	// mandel_gpu uses one thread per sampled pixel: ceil(w / subsample) * ceil(h / subsample) of them.
+	const size_t subcells = ((ctx->w + subsample - 1) / subsample) * ((ctx->h + subsample - 1) / subsample);
+	const int subblocks = (subcells + 1023) / 1024;  // rounded up: nblocks / subsample^2 truncates and leaves trailing samples uncomputed
 	CUDA_CHECK(cudaMemcpy(ctx->devPar, par, sizeof(Params), cudaMemcpyHostToDevice));
 	CUDA_CHECK(cudaMemcpy(ctx->devCtx, ctx, sizeof(Mandel), cudaMemcpyHostToDevice));
 	const int nblocks = (ctx->w * ctx->h + 1023) / 1024;
-	mandel_gpu<<<nblocks, 1024>>>(ctx->devImg, ctx->devCtx, ctx->devPar);
+	mandel_gpu<<<subblocks, 1024>>>(ctx->devImg, ctx->devCtx, ctx->devPar, subsample);
+	CUDA_CHECK(cudaGetLastError());  // report a rejected launch configuration at the launch site
+	if (subsample > 1) {
+		unsubsample_gpu<<<nblocks, 1024>>>(ctx->devImg, ctx->devCtx, subsample);
+	}
 
 	CUDA_CHECK(cudaMemcpy(ctx->img, ctx->devImg, ctx->w * ctx->h * sizeof(int16_t), cudaMemcpyDeviceToHost));
 #else
-	mandel_cpu(ctx->img, ctx, par);
+	mandel_cpu(ctx->img, ctx, par, subsample);
 #endif
 
 	int64_t t2 = gettimestamp();
@@ -155,20 +206,41 @@ void mandel_render(uint8_t *dst, Mandel *ctx, const Params *par) {
 	int64_t t3 = gettimestamp();
 
 	cout << "gpu part: " << (t2 - t1) / 1000000.0 << " sec   "
-	     << "cpu part: " << (t3 - t2) / 1000000 << " sec" << endl;
+	     << "cpu part: " << (t3 - t2) / 1000000.0 << " sec" << endl;
 }
 
 #if 0
 int main() {
 	Mandel *ctx = mandel_init(1920, 1080);
 	Params par = mandel_default_params();
+	// Disabled branch: one still frame. Enabled branch: a zoom sequence of frames.
+#if 0
 	par.cx = -0.73844331961137488;
 	par.cy = -0.20921562151741355;
 	par.imgw = 9.6624448855068765e-08;
 	par.maxit = 20000;
+
 	uint8_t *img = new uint8_t[3 * 1920 * 1080];
-	mandel_render(img, ctx, &par);
+	mandel_render(img, ctx, &par, 1);
 	mandel_free(ctx);
 	bmp_rgb_encode_file("out.bmp", img, 1920, 1080);
+#else
+	par.cx = -0.23845305789504126;
+	par.cy = 0.87200567648858607;
+	par.imgw = 3.5;  // zoom to 7.5815186948616694e-14
+	par.maxit = 6144;
+
+	uint8_t *img = new uint8_t[3 * 1920 * 1080];
+	// 142 frames; each frame's view is 0.8x as wide as the previous one.
+	for (int i = 0; i <= 141; i++) {
+		mandel_render(img, ctx, &par, 1);
+		char fname[64];
+		snprintf(fname, sizeof fname, "outdir/out%03d.bmp", i);
+		bmp_rgb_encode_file(fname, img, 1920, 1080);
+		par.imgw *= 0.8;
+	}
+	delete[] img;
+	mandel_free(ctx);
+#endif
 }
 #endif