#include #include #include #include "mandel.h" #include "lodepng.h" #include "bmp.h" #include "cplx.h" using namespace std; #define USE_GPU #define CUDA_CHECK(expr) { \ cudaError_t err_ = (expr); \ if (err_ != cudaSuccess) { \ cerr << "Cuda error: " << err_ << ": " << cudaGetErrorString(err_) << endl; \ cerr << "On " << __FILE__ << ":" << __LINE__ << ": " << #expr << endl; \ exit(1); \ } \ } struct Mandel { int w, h; int16_t *img; int16_t *devImg; Params *devPar; Mandel *devCtx; // yes double imgh; // transient; calculated from params each time }; Params mandel_default_params() { Params par; par.cx = -0.5; par.cy = 0.0; par.imgw = 3.5; par.maxit = 512; par.par1 = par.par2 = 0; return par; } Mandel* mandel_init(int w, int h) { Mandel *ctx = new Mandel; ctx->w = w; ctx->h = h; ctx->img = new int16_t[w * h]; #ifdef USE_GPU CUDA_CHECK(cudaMalloc(&ctx->devImg, w * h * sizeof(int16_t))); CUDA_CHECK(cudaMalloc(&ctx->devPar, sizeof(Params))); CUDA_CHECK(cudaMalloc(&ctx->devCtx, sizeof(Mandel))); #endif return ctx; } void mandel_free(Mandel *ctx) { delete[] ctx->img; #ifdef USE_GPU CUDA_CHECK(cudaFree(ctx->devImg)); CUDA_CHECK(cudaFree(ctx->devPar)); CUDA_CHECK(cudaFree(ctx->devCtx)); #endif delete ctx; } // These macros may assume that the parameters are simple variable names. #if 0 #define MANDEL_FUNCTION_REAL(a, b, a2, b2, x, y) (a2 - b2 + x) #define MANDEL_FUNCTION_IMAG(a, b, a2, b2, x, y) (2 * a * b + y) #endif #if 0 #define MANDEL_FUNCTION_REAL(a, b, a2, b2, x, y) (a2 - b2 + x + 1) #define MANDEL_FUNCTION_IMAG(a, b, a2, b2, x, y) (2 * a * b + y) #endif #if 0 #define MANDEL_FUNCTION_REAL(a, b, a2, b2, x, y) (a2 * a - 3 * a * b * b + x) #define MANDEL_FUNCTION_IMAG(a, b, a2, b2, x, y) (3 * a * a * b - b2 * b + y) #endif #if 1 #define MANDEL_FUNCTION_REAL(a, b, a2, b2, x, y) (par->par1 * 0.01 * complex_power<5>(a, b, a2, b2).x + (1.0 - par->par2 * 0.01) * complex_power<2>(a, b, a2, b2).x + x) #define MANDEL_FUNCTION_IMAG(a, b, a2, b2, x, y) (par->par1 * 0.01 * complex_power<5>(a, b, a2, b2).y + (1.0 - par->par2 * 0.01) * complex_power<2>(a, b, a2, b2).y + y) #endif #define MANDEL_GENERIC(dst, ctx, par, ix, iy, idx) { \ const double x = (par).cx - (par).imgw / 2 + (par).imgw * (ix) / ((ctx).w-1); \ const double y = (par).cy - (ctx).imgh / 2 + (ctx).imgh * (iy) / ((ctx).h-1); \ double a = x, b = y, a2 = a * a, b2 = b * b; \ int16_t iter, maxiter = (par).maxit; \ for (iter = 0; iter < maxiter && a2 + b2 < 4; iter++) { \ double newa = MANDEL_FUNCTION_REAL(a, b, a2, b2, x, y); \ double newb = MANDEL_FUNCTION_IMAG(a, b, a2, b2, x, y); \ a = newa; b = newb; \ a2 = a * a; b2 = b * b; \ } \ (dst)[idx] = iter; \ } __global__ void mandel_gpu(int16_t *dst, const Mandel *ctx, const Params *par, int subsample) { const int subidx = blockDim.x * blockIdx.x + threadIdx.x; const int subw = (ctx->w + subsample - 1) / subsample; const int subix = subidx % subw, subiy = subidx / subw; const int ix = subix * subsample, iy = subiy * subsample; const int idx = ctx->w * iy + ix; const int riy = ctx->h - 1 - iy; if (iy >= ctx->h) return; MANDEL_GENERIC(dst, *ctx, *par, ix, riy, idx); } __global__ void unsubsample_gpu(int16_t *dst, const Mandel *ctx, int subsample) { const int idx = blockDim.x * blockIdx.x + threadIdx.x; const int ix = idx % ctx->w, iy = idx / ctx->w; const int cellix = ix / subsample * subsample, celliy = iy / subsample * subsample; dst[ctx->w * iy + ix] = dst[ctx->w * celliy + cellix]; } // Unused in GPU mode __attribute__((unused)) static inline void mandel_cpu(int16_t *dst, const Mandel *ctx, const Params *par, int subsample) { for (int iy = 0; iy < ctx->h; iy += subsample) { for (int ix = 0; ix < ctx->w; ix += subsample) { int idx = ctx->w * iy + ix; int riy = ctx->h - 1 - iy; MANDEL_GENERIC(dst, *ctx, *par, ix, riy, idx); } } if (subsample == 1) return; for (int iy = 0; iy < ctx->h; iy += subsample) { for (int ix = 0; ix < ctx->w; ix += subsample) { const int16_t value = dst[ctx->w * iy + ix]; for (int subiy = 0; subiy < subsample; subiy++) { for (int subix = 0; subix < subsample; subix++) { dst[ctx->w * (iy + subiy) + ix + subix] = value; } } } } } static void hue2rgb(int hue, uint8_t *r, uint8_t *g, uint8_t *b) { const int X = (60 - abs(hue % 120 - 60)) * 255 / 60; const int C = 255; switch (hue / 60 % 6) { case 0: *r = C; *g = X; *b = 0; break; case 1: *r = X; *g = C; *b = 0; break; case 2: *r = 0; *g = C; *b = X; break; case 3: *r = 0; *g = X; *b = C; break; case 4: *r = X; *g = 0; *b = C; break; case 5: *r = C; *g = 0; *b = X; break; } } static void curve(int16_t iter, uint8_t *red, uint8_t *gre, uint8_t *blu, int16_t maxit) { #if 0 double x = (double)iter / maxit; *red = sqrt(x) * 255; *gre = (-x*x*x + x*x*3/2 + x/2) * 255; *blu = x*x * 255; #else hue2rgb(iter, red, gre, blu); #endif } static int64_t gettimestamp() { struct timeval tv; gettimeofday(&tv, NULL); return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; } double mandel_imgh(const Mandel *ctx, const Params *par) { return par->imgw * ctx->h / ctx->w; } void mandel_render(uint8_t *dst, Mandel *ctx, const Params *par, size_t subsample) { ctx->imgh = mandel_imgh(ctx, par); int64_t t1 = gettimestamp(); #ifdef USE_GPU assert((size_t)(int)subsample == subsample); const int subfactor = subsample * subsample; CUDA_CHECK(cudaMemcpy(ctx->devPar, par, sizeof(Params), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(ctx->devCtx, ctx, sizeof(Mandel), cudaMemcpyHostToDevice)); const int nblocks = (ctx->w * ctx->h + 1023) / 1024; mandel_gpu<<>>(ctx->devImg, ctx->devCtx, ctx->devPar, subsample); if (subsample > 1) { unsubsample_gpu<<>>(ctx->devImg, ctx->devCtx, subsample); } CUDA_CHECK(cudaMemcpy(ctx->img, ctx->devImg, ctx->w * ctx->h * sizeof(int16_t), cudaMemcpyDeviceToHost)); #else mandel_cpu(ctx->img, ctx, par, subsample); #endif int64_t t2 = gettimestamp(); for (int i = 0; i < ctx->w * ctx->h; i++) { if (ctx->img[i] == par->maxit) { dst[3*i] = dst[3*i+1] = dst[3*i+2] = 0; } else { curve(ctx->img[i], &dst[3*i], &dst[3*i+1], &dst[3*i+2], par->maxit); } } int64_t t3 = gettimestamp(); cout << "gpu part: " << (t2 - t1) / 1000000.0 << " sec " << "cpu part: " << (t3 - t2) / 1000000.0 << " sec" << endl; } #if 0 int main() { Mandel *ctx = mandel_init(1920, 1080); Params par = mandel_default_params(); #if 0 par.cx = -0.73844331961137488; par.cy = -0.20921562151741355; par.imgw = 9.6624448855068765e-08; par.maxit = 20000; uint8_t *img = new uint8_t[3 * 1920 * 1080]; mandel_render(img, ctx, &par); mandel_free(ctx); bmp_rgb_encode_file("out.bmp", img, 1920, 1080); #else par.cx = -0.23845305789504126; par.cy = 0.87200567648858607; par.imgw = 3.5; // zoom to 7.5815186948616694e-14 par.maxit = 6144; uint8_t *img = new uint8_t[3 * 1920 * 1080]; for (int i = 0; i <= 5 * 141; i++) { mandel_render(img, ctx, &par, 1); char fname[64]; sprintf(fname, "outdir/out%03d.png", i); int64_t t1 = gettimestamp(); assert(lodepng::encode(fname, img, 1920, 1080, LCT_RGB) == 0); // bmp_rgb_encode_file(fname, img, 1920, 1080); int64_t t2 = gettimestamp(); cout << "file writing: " << (t2 - t1) / 1000000.0 << " sec" << endl; par.imgw *= pow(0.8, 1.0 / 5.0); } mandel_free(ctx); #endif } #endif